[ { "id": "B11bwYgfM", "title": "Robust Task Clustering for Deep and Diverse Multi-Task and Few-Shot Learning", "track": "main", "status": "Withdraw", "tldr": "We propose a matrix-completion based task clustering algorithm for deep multi-task and few-shot learning in the settings with large numbers of diverse tasks.", "abstract": "We investigate task clustering for deep learning-based multi-task and few-shot learning in the settings with large numbers of diverse tasks. Our method measures task similarities using cross-task transfer performance matrix. Although this matrix provides us critical information regarding similarities between tasks, the uncertain task-pairs, i.e., the ones with extremely asymmetric transfer scores, may collectively mislead clustering algorithms to output an inaccurate task-partition. Moreover, when the number of tasks is large, generating the full transfer performance matrix can be very time consuming. To overcome these limitations, we propose a novel task clustering algorithm to estimate the similarity matrix based on the theory of matrix completion. The proposed algorithm can work on partially-observed similarity matrices based on only sampled task-pairs with reliable scores, ensuring its efficiency and robustness. Our theoretical analysis shows that under mild assumptions, the reconstructed matrix perfectly matches the underlying \u201ctrue\u201d similarity matrix with an overwhelming probability. The final task partition is computed by applying an efficient spectral clustering algorithm to the recovered matrix. Our results show that the new task clustering method can discover task clusters that benefit both multi-task learning and few-shot learning setups for sentiment classification and dialog intent classification tasks.", "keywords": "task clustering;matrix completion;multi-task learning;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Mo Yu;Xiaoxiao Guo;Jinfeng Yi;Shiyu Chang;Saloni Potdar;Gerald Tesauro;Haoyu Wang;Bowen Zhou", "authorids": ";xiaoxiao.guo@ibm.com;jinfengyi.ustc@gmail.com;;;;;", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B11bwYgfM", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13861871824681127552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Learning to Count Objects in Natural Images for Visual Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/307", "id": "B12Js_yRb", "author_site": "Yan Zhang, Jonathon Hare, Adam Prugel-Bennett", "tldr": "Enabling Visual Question Answering models to count by handling overlapping object proposals.", "abstract": "Visual Question Answering (VQA) models have struggled with counting objects in natural images so far. We identify a fundamental problem due to soft attention in these models as a cause. To circumvent this problem, we propose a neural network component that allows robust counting from object proposals. 
Experiments on a toy task show the effectiveness of this component and we obtain state-of-the-art accuracy on the number category of the VQA v2 dataset without negatively affecting other categories, even outperforming ensemble models with our single model. On a difficult balanced pair metric, the component gives a substantial improvement in counting over a strong baseline by 6.6%.", "keywords": "visual question answering;vqa;counting", "primary_area": "", "supplementary_material": "", "author": "Yan Zhang;Jonathon Hare;Adam Pr\u00fcgel-Bennett", "authorids": "yz5n12@ecs.soton.ac.uk;jsh2@ecs.soton.ac.uk;apb@ecs.soton.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2018learning,\ntitle={Learning to Count Objects in Natural Images for Visual Question Answering},\nauthor={Yan Zhang and Jonathon Hare and Adam Pr\u00fcgel-Bennett},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B12Js_yRb},\n}", "github": "[![github](/images/github_icon.svg) Cyanogenoid/vqa-counting](https://github.com/Cyanogenoid/vqa-counting)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 260, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5291501502665174038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B12Js_yRb", "pdf": "https://openreview.net/pdf?id=B12Js_yRb", "email": ";;", "author_num": 3 }, { "id": "B12QlQWRW", "title": "THINK VISUALLY: QUESTION ANSWERING THROUGH VIRTUAL IMAGERY", "track": "main", "status": "Active", "tldr": "", "abstract": "In this paper, we study the problem of visual reasoning in the context of textual question answering. We introduce Dynamic Spatial Memory Networks (DSMN), a new deep network architecture that specializes in answering questions that admit latent visual representations, and learns to generate and reason over such representations. Further, we propose two synthetic benchmarks, HouseQA and ShapeIntersection, to evaluate the visual reasoning capability of textual QA systems. 
Experimental results validate the effectiveness of our proposed DSMN for visual reasoning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper1107/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018think,\n title={THINK VISUALLY: QUESTION ANSWERING THROUGH VIRTUAL IMAGERY},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=B12QlQWRW}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B12QlQWRW", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16407675997585537579&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B13EC5u6W", "title": "Thinking like a machine \u2014 generating visual rationales through latent space optimization", "track": "main", "status": "Reject", "tldr": "We propose a method of using GANs to generate high quality visual rationales to help explain model predictions. ", "abstract": "Interpretability and small labelled datasets are key issues in the practical application of deep learning, particularly in areas such as medicine. In this paper, we present a semi-supervised technique that addresses both these issues simultaneously. We learn dense representations from large unlabelled image datasets, then use those representations to both learn classifiers from small labeled sets and generate visual rationales explaining the predictions. Using chest radiography diagnosis as a motivating application, we show our method has good generalization ability by learning to represent our chest radiography dataset while training a classifier on a separate set from a different institution. Our method identifies heart failure and other thoracic diseases. For each prediction, we generate visual rationales for positive classifications by optimizing a latent representation to minimize the probability of disease while constrained by a similarity measure in image space. Decoding the resultant latent representation produces an image without apparent disease. The difference between the original and the altered image forms an interpretable visual rationale for the algorithm's prediction. 
Our method simultaneously produces visual rationales that compare favourably to previous techniques and a classifier that outperforms the current state-of-the-art.", "keywords": "interpretability;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Jarrel Seah;Jennifer Tang;Andy Kitchen;Jonathan Seah", "authorids": "jarrelscy@gmail.com;jarrelscy@gmail.com;jarrelscy@gmail.com;jarrelscy@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nseah2018thinking,\ntitle={Thinking like a machine \u2014 generating visual rationales through latent space optimization},\nauthor={Jarrel Seah and Jennifer Tang and Andy Kitchen and Jonathan Seah},\nyear={2018},\nurl={https://openreview.net/forum?id=B13EC5u6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B13EC5u6W", "pdf_size": 0, "rating": "4;7;8", "confidence": "3;4;2", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.2401922307076307, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sdF1egjOtJcJ:scholar.google.com/&scioq=Thinking+like+a+machine+%E2%80%94+generating+visual+rationales+through+latent+space+optimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Progressive Reinforcement Learning with Distillation for Multi-Skilled Motion Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/300", "id": "B13njo1R-", "author_site": "Glen Berseth, Cheng Xie, Paul Cernek, Michiel van de Panne", "tldr": "A continual learning method that uses distillation to combine expert policies and transfer learning to accelerate learning new skills.", "abstract": "Deep reinforcement learning has demonstrated increasing capabilities for continuous control problems,\nincluding agents that can move with skill and agility through their environment. \nAn open problem in this setting is that of developing good strategies for integrating or merging policies\nfor multiple skills, where each individual skill is a specialist in a specific skill and its associated state distribution. 
\nWe extend policy distillation methods to the continuous action setting and leverage this technique to combine \\expert policies,\nas evaluated in the domain of simulated bipedal locomotion across different classes of terrain.\nWe also introduce an input injection method for augmenting an existing policy network to exploit new input features.\nLastly, our method uses transfer learning to assist in the efficient acquisition of new skills.\nThe combination of these methods allows a policy to be incrementally augmented with new skills.\nWe compare our progressive learning and integration via distillation (PLAID) method\nagainst three alternative baselines.", "keywords": "Reinforcement Learning;Distillation;Transfer Learning;Continual Learning", "primary_area": "", "supplementary_material": "", "author": "Glen Berseth;Cheng Xie;Paul Cernek;Michiel Van de Panne", "authorids": "gberseth@gmail.com;cheng.k.xie@gmail.com;pcernek@cs.ubc.ca;van@cs.ubc.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nberseth2018progressive,\ntitle={Progressive Reinforcement Learning with Distillation for Multi-Skilled Motion Control},\nauthor={Glen Berseth and Cheng Xie and Paul Cernek and Michiel Van de Panne},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B13njo1R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10887191006544624905&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B13njo1R-", "pdf": "https://openreview.net/pdf?id=B13njo1R-", "email": ";;;", "author_num": 4 }, { "id": "B14TlG-RW", "title": "QANet: Combining Local Convolution with Global Self-Attention for Reading Comprehension", "track": "main", "status": "Poster", "tldr": "A simple architecture consisting of convolutions and attention achieves results on par with the best documented recurrent models.", "abstract": " Current end-to-end machine reading and question answering (Q\\&A) models are primarily based on recurrent neural networks (RNNs) with attention. Despite their success, these models are often slow for both training and inference due to the sequential nature of RNNs. We propose a new Q\\&A architecture called QANet, which does not require recurrent networks: Its encoder consists exclusively of convolution and self-attention, where convolution models local interactions and self-attention models global interactions. On the SQuAD dataset, our model is 3x to 13x faster in training and 4x to 9x faster in inference, while achieving equivalent accuracy to recurrent models. The speed-up gain allows us to train the model with much more data. We hence combine our model with data generated by backtranslation from a neural machine translation model. 
\nOn the SQuAD dataset, our single model, trained with augmented data, achieves 84.6 F1 score on the test set, which is significantly better than the best published F1 score of 81.8.", "keywords": "squad;stanford question answering dataset;reading comprehension;attention;text convolutions;question answering", "primary_area": "", "supplementary_material": "", "author": "Adams Wei Yu;David Dohan;Minh-Thang Luong;Rui Zhao;Kai Chen;Mohammad Norouzi;Quoc V. Le", "authorids": "weiyu@cs.cmu.edu;ddohan@google.com;thangluong@google.com;rzhao@google.com;kaichen@google.com;mnorouzi@google.com;qvl@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nwei2018fast,\ntitle={Fast and Accurate Reading Comprehension by Combining Self-Attention and Convolution},\nauthor={Adams Wei Yu and David Dohan and Quoc Le and Thang Luong and Rui Zhao and Kai Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B14TlG-RW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 15 community implementations](https://paperswithcode.com/paper/?openreview=B14TlG-RW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B14TlG-RW", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 7, "corr_rating_confidence": 0.6546536707079772, "gs_citation": 1263, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15745561136241294753&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B14uJzW0b", "title": "No Spurious Local Minima in a Two Hidden Unit ReLU Network", "track": "main", "status": "Workshop", "tldr": "Recovery guarantee of stochastic gradient descent with random initialization for learning a two-layer neural network with two hidden nodes, unit-norm weights, ReLU activation functions and Gaussian inputs.", "abstract": "Deep learning models can be efficiently optimized via stochastic gradient descent, but there is little theoretical evidence to support this. A key question in optimization is to understand when the optimization landscape of a neural network is amenable to gradient-based optimization. We focus on a simple neural network two-layer ReLU network with two hidden units, and show that all local minimizers are global. This combined with recent work of Lee et al. (2017); Lee et al. (2016) show that gradient descent converges to the global minimizer.", "keywords": "Non-convex optimization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Chenwei Wu;Jiajun Luo;Jason D. Lee", "authorids": "wucw14@mails.tsinghua.edu.cn;jiajunlu@usc.edu;jasonlee@marshall.usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2018no,\ntitle={No Spurious Local Minima in a Two Hidden Unit Re{LU} Network},\nauthor={Chenwei Wu and Jiajun Luo and Jason D. 
Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=B14uJzW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B14uJzW0b", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;2", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13443636241312263319&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B16_iGWCW", "title": "Deep Boosting of Diverse Experts", "track": "main", "status": "Reject", "tldr": " A deep boosting algorithm is developed to learn more discriminative ensemble classifier by seamlessly combining a set of base deep CNNs.", "abstract": "In this paper, a deep boosting algorithm is developed to\nlearn more discriminative ensemble classifier by seamlessly combining a set of base deep CNNs (base experts)\nwith diverse capabilities, e.g., these base deep CNNs are\nsequentially trained to recognize a set of \nobject classes in an easy-to-hard way according to their\nlearning complexities. Our experimental results have demonstrated\nthat our deep boosting algorithm can significantly improve the\naccuracy rates on large-scale visual recognition.", "keywords": "boosting learning;deep learning;neural network", "primary_area": "", "supplementary_material": "", "author": "Wei Zhang;Qiuyu Chen;Jun Yu;Jianping Fan", "authorids": "weizh@fudan.edu.cn;qchen12@uncc.edu;yujun@hdu.edu.cn;jfan@uncc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2018deep,\ntitle={Deep Boosting of Diverse Experts},\nauthor={Wei Zhang and Qiuyu Chen and Jun Yu and Jianping Fan},\nyear={2018},\nurl={https://openreview.net/forum?id=B16_iGWCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=B16_iGWCW", "pdf_size": 0, "rating": "2;5;6", "confidence": "5;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.9607689228305228, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4152121833963402122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B16yEqkCZ", "title": "Avoiding Catastrophic States with Intrinsic Fear", "track": "main", "status": "Reject", "tldr": "Shape reward with intrinsic motivation to avoid catastrophic states and mitigate catastrophic forgetting.", "abstract": "Many practical reinforcement learning problems contain catastrophic states that the optimal policy visits infrequently or never. Even on toy problems, deep reinforcement learners periodically revisit these states, once they are forgotten under a new policy. In this paper, we introduce intrinsic fear, a learned reward shaping that accelerates deep reinforcement learning and guards oscillating policies against periodic catastrophes. Our approach incorporates a second model trained via supervised learning to predict the probability of imminent catastrophe. This score acts as a penalty on the Q-learning objective. Our theoretical analysis demonstrates that the perturbed objective yields the same average return under strong assumptions and an $\\epsilon$-close average return under weaker assumptions. 
Our analysis also shows robustness to classification errors. Equipped with intrinsic fear, our DQNs solve the toy environments and improve on the Atari games Seaquest, Asteroids, and Freeway.", "keywords": "reinforcement learning;safe exploration;dqn", "primary_area": "", "supplementary_material": "", "author": "Zachary C. Lipton;Kamyar Azizzadenesheli;Abhishek Kumar;Lihong Li;Jianfeng Gao;Li Deng", "authorids": "zlipton@cmu.edu;kazizzad@uci.edu;abkumar@ucsd.edu;lihongli.cs@gmail.com;jfgao@microsoft.com;l.deng@ieee.org", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nc.2018avoiding,\ntitle={Avoiding Catastrophic States with Intrinsic Fear},\nauthor={Zachary C. Lipton and Kamyar Azizzadenesheli and Abhishek Kumar and Lihong Li and Jianfeng Gao and Li Deng},\nyear={2018},\nurl={https://openreview.net/forum?id=B16yEqkCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B16yEqkCZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16457665565452188398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Emergence of grid-like representations by training recurrent neural networks to perform spatial localization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/245", "id": "B17JTOe0-", "author_site": "Christopher Cueva, Xue-Xin Wei", "tldr": "To our knowledge, this is the first study to show how neural representations of space, including grid-like cells and border cells as observed in the brain, could emerge from training a recurrent neural network to perform navigation tasks.", "abstract": "Decades of research on the neural code underlying spatial navigation have revealed a diverse set of neural response properties. The Entorhinal Cortex (EC) of the mammalian brain contains a rich set of spatial correlates, including grid cells which encode space using tessellating patterns. However, the mechanisms and functional significance of these spatial representations remain largely mysterious. As a new way to understand these neural representations, we trained recurrent neural networks (RNNs) to perform navigation tasks in 2D arenas based on velocity inputs. Surprisingly, we find that grid-like spatial response patterns emerge in trained networks, along with units that exhibit other spatial correlates, including border cells and band-like cells. All these different functional types of neurons have been observed experimentally. The order of the emergence of grid-like and border cells is also consistent with observations from developmental studies. Together, our results suggest that grid cells, border cells and others as observed in EC may be a natural solution for representing space efficiently given the predominant recurrent connections in the neural circuits.\n", "keywords": "recurrent neural network;grid cell;neural representation of space", "primary_area": "", "supplementary_material": "", "author": "Christopher J. 
Cueva;Xue-Xin Wei", "authorids": "ccueva@gmail.com;weixxpku@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nj.2018emergence,\ntitle={Emergence of grid-like representations by training recurrent neural networks to perform spatial localization},\nauthor={Christopher J. Cueva and Xue-Xin Wei},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B17JTOe0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "8;8;9", "confidence": "4;4;4", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 257, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17982363881710964955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B17JTOe0-", "pdf": "https://openreview.net/pdf?id=B17JTOe0-", "email": ";", "author_num": 2 }, { "title": "Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/99", "id": "B18WgG-CZ", "author_site": "Sandeep Subramanian, Adam Trischler, Yoshua Bengio, Christopher Pal", "tldr": "A large-scale multi-task learning framework with diverse training objectives to learn fixed-length sentence representations", "abstract": "A lot of the recent success in natural language processing (NLP) has been driven by distributed vector representations of words trained on large amounts of text in an unsupervised manner. These representations are typically used as general purpose features for words across a range of NLP problems. However, extending this success to learning representations of sequences of words, such as sentences, remains an open problem. Recent work has explored unsupervised as well as supervised learning techniques with different training objectives to learn general purpose fixed-length sentence representations. In this work, we present a simple, effective multi-task learning framework for sentence representations that combines the inductive biases of diverse training objectives in a single model. \nWe train this model on several data sources with multiple training objectives on over 100 million sentences. Extensive experiments demonstrate that sharing a single recurrent sentence encoder across weakly related tasks leads to consistent improvements over previous methods. 
We present substantial improvements in the context of transfer learning and low-resource settings using our learned general-purpose representations.", "keywords": "distributed sentence representations;multi-task learning", "primary_area": "", "supplementary_material": "", "author": "Sandeep Subramanian;Adam Trischler;Yoshua Bengio;Christopher J Pal", "authorids": "sandeep.subramanian.1@umontreal.ca;adam.trischler@microsoft.com;yoshua.umontreal@gmail.com;christopher.pal@polymtl.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsubramanian2018learning,\ntitle={Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning},\nauthor={Sandeep Subramanian and Adam Trischler and Yoshua Bengio and Christopher J Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B18WgG-CZ},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/SentEval](https://github.com/facebookresearch/SentEval) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B18WgG-CZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;8;8", "confidence": "5;5;5", "rating_avg": 6.666666666666667, "confidence_avg": 5.0, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 414, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14356338572448823379&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B18WgG-CZ", "pdf": "https://openreview.net/pdf?id=B18WgG-CZ", "email": ";;;", "author_num": 4 }, { "id": "B1CEaMbR-", "title": "Clustering with Deep Learning: Taxonomy and New Methods", "track": "main", "status": "Reject", "tldr": "Unifying framework to perform clustering using deep neural networks", "abstract": "Clustering is a fundamental machine learning method. The quality of its results is dependent on the data distribution. For this reason, deep neural networks can be used for learning better representations of the data. In this paper, we propose a systematic taxonomy for clustering with deep learning, in addition to a review of methods from the field. Based on our taxonomy, creating new methods is more straightforward. We also propose a new approach which is built on the taxonomy and surpasses some of the limitations of some previous work. 
Our experimental evaluation on image datasets shows that the method approaches state-of-the-art clustering quality, and performs better in some cases.", "keywords": "clustering;deep learning;neural networks", "primary_area": "", "supplementary_material": "", "author": "Elie Aljalbout;Vladimir Golkov;Yawar Siddiqui;Daniel Cremers", "authorids": "elie.aljalbout@tum.de;vladimir.golkov@tum.de;yawar.siddiqui@tum.de;cremers@tum.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\naljalbout2018clustering,\ntitle={Clustering with Deep Learning: Taxonomy and New Methods},\nauthor={Elie Aljalbout and Vladimir Golkov and Yawar Siddiqui and Daniel Cremers},\nyear={2018},\nurl={https://openreview.net/forum?id=B1CEaMbR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1CEaMbR-", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 357, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18031601999521682004&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "B1CNpYg0-", "title": "Learning to Compute Word Embeddings On the Fly", "track": "main", "status": "Reject", "tldr": "We propose a method to deal with rare words by computing their embedding from definitions.", "abstract": "Words in natural language follow a Zipfian distribution whereby some words are frequent but most are rare. Learning representations for words in the ``long tail'' of this distribution requires enormous amounts of data. \nRepresentations of rare words trained directly on end tasks are usually poor, requiring us to pre-train embeddings on external data, or treat all rare words as out-of-vocabulary words with a unique representation. We provide a method for predicting embeddings of rare words on the fly from small amounts of auxiliary data with a network trained end-to-end for the downstream task. 
We show that this improves results against baselines where embeddings are trained on the end task for reading comprehension, recognizing textual entailment and language modeling.\n", "keywords": "NLU;word embeddings;representation learning", "primary_area": "", "supplementary_material": "", "author": "Dzmitry Bahdanau;Tom Bosc;Stanis\u0142aw Jastrz\u0119bski;Edward Grefenstette;Pascal Vincent;Yoshua Bengio", "authorids": "dimabgv@gmail.com;bosc.tom@gmail.com;staszek.jastrzebski@gmail.com;etg@google.com;pascal.vincent@umontreal.ca;yoshua.umontreal@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbahdanau2018learning,\ntitle={Learning to Compute Word Embeddings On the Fly},\nauthor={Dzmitry Bahdanau and Tom Bosc and Stanis\u0142aw Jastrz\u0119bski and Edward Grefenstette and Pascal Vincent and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=B1CNpYg0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1CNpYg0-", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12117150033809186035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1CQGfZ0b", "title": "Learning to select examples for program synthesis", "track": "main", "status": "Reject", "tldr": "In a program synthesis context where the input is a set of examples, we reduce the cost by computing a subset of representative examples", "abstract": "Program synthesis is a class of regression problems where one seeks a solution, in the form of a source-code program, that maps the inputs to their corresponding outputs exactly. Due to its precise and combinatorial nature, it is commonly formulated as a constraint satisfaction problem, where input-output examples are expressed as constraints, and solved with a constraint solver. A key challenge of this formulation is that of scalability: While constraint solvers work well with few well-chosen examples, constraining the entire set of examples constitutes a significant overhead in both time and memory. In this paper we address this challenge by constructing a representative subset of examples that is both small and able to constrain the solver sufficiently. We build the subset one example at a time, using a trained discriminator to predict the probability of unchosen input-output examples conditioned on the chosen input-output examples, adding the least probable example to the subset. 
Experiments on a diagram drawing domain show that our approach produces subsets of examples that are small and representative for the constraint solver.", "keywords": "program synthesis;program induction;example selection", "primary_area": "", "supplementary_material": "", "author": "Yewen Pu;Zachery Miranda;Armando Solar-Lezama;Leslie Pack Kaelbling", "authorids": "yewenpu@mit.edu;zmiranda@mit.edu;asolar@csail.mit.edu;lpk@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npu2018learning,\ntitle={Learning to select examples for program synthesis},\nauthor={Yewen Pu and Zachery Miranda and Armando Solar-Lezama and Leslie Pack Kaelbling},\nyear={2018},\nurl={https://openreview.net/forum?id=B1CQGfZ0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1CQGfZ0b", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16167696538327647352&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1D6ty-A-", "title": "Training Autoencoders by Alternating Minimization", "track": "main", "status": "Reject", "tldr": "We utilize the alternating minimization principle to provide an effective novel technique to train deep autoencoders.", "abstract": "We present DANTE, a novel method for training neural networks, in particular autoencoders, using the alternating minimization principle. DANTE provides a distinct perspective in lieu of traditional gradient-based backpropagation techniques commonly used to train deep networks. It utilizes an adaptation of quasi-convex optimization techniques to cast autoencoder training as a bi-quasi-convex optimization problem. We show that for autoencoder configurations with both differentiable (e.g. sigmoid) and non-differentiable (e.g. ReLU) activation functions, we can perform the alternations very effectively. DANTE effortlessly extends to networks with multiple hidden layers and varying network configurations. 
In experiments on standard datasets, autoencoders trained using the proposed method were found to be very promising when compared to those trained using traditional backpropagation techniques, both in terms of training speed, as well as feature extraction and reconstruction performance.", "keywords": "Deep Learning;Autoencoders;Alternating Optimization", "primary_area": "", "supplementary_material": "", "author": "Sneha Kudugunta;Adepu Shankar;Surya Chavali;Vineeth Balasubramanian;Purushottam Kar", "authorids": "cs14btech11020@iith.ac.in;cs14resch11001@iith.ac.in;cs13b1028@iith.ac.in;vineethnb@iith.ac.in;purushot@cse.iitk.ac.in", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkudugunta2018training,\ntitle={Training Autoencoders by Alternating Minimization},\nauthor={Sneha Kudugunta and Adepu Shankar and Surya Chavali and Vineeth Balasubramanian and Purushottam Kar},\nyear={2018},\nurl={https://openreview.net/forum?id=B1D6ty-A-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1D6ty-A-", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YLrBx8h7L10J:scholar.google.com/&scioq=Training+Autoencoders+by+Alternating+Minimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "A Simple Neural Attentive Meta-Learner", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/64", "id": "B1DmUzWAW", "author_site": "Nikhil Mishra, Mostafa Rohaninejad, Xi Chen, Pieter Abbeel", "tldr": "a simple RNN-based meta-learner that achieves SOTA performance on popular benchmarks", "abstract": "Deep neural networks excel in regimes with large amounts of data, but tend to struggle when data is scarce or when they need to adapt quickly to changes in the task. In response, recent work in meta-learning proposes training a meta-learner on a distribution of similar tasks, in the hopes of generalization to novel but related tasks by learning a high-level strategy that captures the essence of the problem it is asked to solve. However, many recent meta-learning approaches are extensively hand-designed, either using architectures specialized to a particular application, or hard-coding algorithmic components that constrain how the meta-learner solves the task. We propose a class of simple and generic meta-learner architectures that use a novel combination of temporal convolutions and soft attention; the former to aggregate information from past experience and the latter to pinpoint specific pieces of information. In the most extensive set of meta-learning experiments to date, we evaluate the resulting Simple Neural AttentIve Learner (or SNAIL) on several heavily-benchmarked tasks. 
On all tasks, in both supervised and reinforcement learning, SNAIL attains state-of-the-art performance by significant margins.", "keywords": "meta-learning;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Nikhil Mishra;Mostafa Rohaninejad;Xi Chen;Pieter Abbeel", "authorids": "nmishra@berkeley.edu;rohaninejadm@berkeley.edu;adslcx@gmail.com;pabbeel@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmishra2018a,\ntitle={A Simple Neural Attentive Meta-Learner},\nauthor={Nikhil Mishra and Mostafa Rohaninejad and Xi Chen and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1DmUzWAW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=B1DmUzWAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1683, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13155723657744889520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1DmUzWAW", "pdf": "https://openreview.net/pdf?id=B1DmUzWAW", "email": ";;;", "author_num": 4 }, { "title": "Deep Neural Networks as Gaussian Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/91", "id": "B1EA-M-0Z", "author_site": "Jaehoon Lee, Yasaman Bahri, Roman Novak, Samuel Schoenholz, Jeffrey Pennington, Jascha Sohl-Dickstein", "tldr": "We show how to make predictions using deep networks, without training deep networks.", "abstract": "It has long been known that a single-layer fully-connected neural network with an i.i.d. prior over its parameters is equivalent to a Gaussian process (GP), in the limit of infinite network width. This correspondence enables exact Bayesian inference for infinite width neural networks on regression tasks by means of evaluating the corresponding GP. Recently, kernel functions which mimic multi-layer random neural networks have been developed, but only outside of a Bayesian framework. As such, previous work has not identified that these kernels can be used as covariance functions for GPs and allow fully Bayesian prediction with a deep neural network.\n\nIn this work, we derive the exact equivalence between infinitely wide, deep, networks and GPs with a particular covariance function. We further develop a computationally efficient pipeline to compute this covariance function. We then use the resulting GP to perform Bayesian inference for deep neural networks on MNIST and CIFAR-10. We observe that the trained neural network accuracy approaches that of the corresponding GP with increasing layer width, and that the GP uncertainty is strongly correlated with trained network prediction error. We further find that test performance increases as finite-width trained networks are made wider and more similar to a GP, and that the GP-based predictions typically outperform those of finite-width networks. 
Finally we connect the prior distribution over weights and variances in our GP formulation to the recent development of signal propagation in random neural networks.", "keywords": "Gaussian process;Bayesian regression;deep networks;kernel methods", "primary_area": "", "supplementary_material": "", "author": "Jaehoon Lee;Yasaman Bahri;Roman Novak;Samuel S. Schoenholz;Jeffrey Pennington;Jascha Sohl-Dickstein", "authorids": "jaehlee@google.com;yasamanb@google.com;romann@google.com;schsam@google.com;jpennin@google.com;jaschasd@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nlee2018deep,\ntitle={Deep Neural Networks as Gaussian Processes},\nauthor={Jaehoon Lee and Jascha Sohl-dickstein and Jeffrey Pennington and Roman Novak and Sam Schoenholz and Yasaman Bahri},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1EA-M-0Z},\n}", "github": "[![github](/images/github_icon.svg) brain-research/nngp](https://github.com/brain-research/nngp) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=B1EA-M-0Z)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 1424, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6709509064500094656&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=B1EA-M-0Z", "pdf": "https://openreview.net/pdf?id=B1EA-M-0Z", "email": ";;;;;", "author_num": 6 }, { "id": "B1EGg7ZCb", "title": "Autonomous Vehicle Fleet Coordination With Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "Utilized Deep Reinforcement Learning to teach agents ride-sharing fleet style coordination.", "abstract": "Autonomous vehicles are becoming more common in city transportation. Companies will begin to find a need to teach these vehicles smart city fleet coordination. Currently, simulation based modeling along with hand coded rules dictate the decision making of these autonomous vehicles. We believe that complex intelligent behavior can be learned by these agents through Reinforcement Learning. In this paper, we discuss our work for solving this system by adapting the Deep Q-Learning (DQN) model to the multi-agent setting. Our approach applies deep reinforcement learning by combining convolutional neural networks with DQN to teach agents to fulfill customer demand in an environment that is partially observable to them. We also demonstrate how to utilize transfer learning to teach agents to balance multiple objectives such as navigating to a charging station when its energy level is low. 
The two evaluations presented show that our solution successfully teaches agents cooperation policies while balancing multiple objectives.", "keywords": "Deep Reinforcement Learning;multi-agent systems", "primary_area": "", "supplementary_material": "", "author": "Cane Punma", "authorids": "cane.cane@live.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\npunma2018autonomous,\ntitle={Autonomous Vehicle Fleet Coordination With Deep Reinforcement Learning},\nauthor={Cane Punma},\nyear={2018},\nurl={https://openreview.net/forum?id=B1EGg7ZCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1EGg7ZCb", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8699515086735085895&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1EPYJ-C-", "title": "Federated Learning: Strategies for Improving Communication Efficiency", "track": "main", "status": "Reject", "tldr": "", "abstract": "Federated Learning is a machine learning setting where the goal is to train a high-quality centralized model while training data remains distributed over a large number of clients each with unreliable and relatively slow network connections. We consider learning algorithms for this setting where on each round, each client independently computes an update to the current model based on its local data, and communicates this update to a central server, where the client-side updates are aggregated to compute a new global model. The typical clients in this setting are mobile phones, and communication efficiency is of the utmost importance. \n\nIn this paper, we propose two ways to reduce the uplink communication costs: structured updates, where we directly learn an update from a restricted space parametrized using a smaller number of variables, e.g. either low-rank or a random mask; and sketched updates, where we learn a full model update and then compress it using a combination of quantization, random rotations, and subsampling before sending it to the server. Experiments on both convolutional and recurrent networks show that the proposed methods can reduce the communication cost by two orders of magnitude.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jakub Kone\u010dn\u00fd;H. Brendan McMahan;Felix X. Yu;Ananda Theertha Suresh;Dave Bacon;Peter Richt\u00e1rik", "authorids": "konkey@google.com;mcmahan@google.com;felixyu@google.com;theertha@google.com;dabacon@google.com;peter.richtarik@kaust.edu.sa", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkone\u010dn\u00fd2018federated,\ntitle={Federated Learning: Strategies for Improving Communication Efficiency},\nauthor={Jakub Kone\u010dn\u00fd and H. Brendan McMahan and Felix X. 
Yu and Ananda Theertha Suresh and Dave Bacon and Peter Richt\u00e1rik},\nyear={2018},\nurl={https://openreview.net/forum?id=B1EPYJ-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1EPYJ-C-", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 6262, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3080297582660271776&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "B1EVwkqTW", "title": "Make SVM great again with Siamese kernel for few-shot learning", "track": "main", "status": "Reject", "tldr": "The proposed method is an end-to-end neural SVM, which is optimized for few-shot learning.", "abstract": "While deep neural networks have shown outstanding results in a wide range of applications,\nlearning from a very limited number of examples is still a challenging\ntask. Despite the difficulties of the few-shot learning, metric-learning techniques\nshowed the potential of the neural networks for this task. While these methods\nperform well, they don\u2019t provide satisfactory results. In this work, the idea of\nmetric-learning is extended with Support Vector Machines (SVM) working mechanism,\nwhich is well known for generalization capabilities on a small dataset.\nFurthermore, this paper presents an end-to-end learning framework for training\nadaptive kernel SVMs, which eliminates the problem of choosing a correct kernel\nand good features for SVMs. Next, the one-shot learning problem is redefined\nfor audio signals. Then the model was tested on vision task (using Omniglot\ndataset) and speech task (using TIMIT dataset) as well. 
Actually, the algorithm\nusing Omniglot dataset improved accuracy from 98.1% to 98.5% on the one-shot\nclassification task and from 98.9% to 99.3% on the few-shot classification task.", "keywords": "SVM;siamese network;one-shot learning;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Bence Tilk", "authorids": "bence.tilk@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ntilk2018make,\ntitle={Make {SVM} great again with Siamese kernel for few-shot learning},\nauthor={Bence Tilk},\nyear={2018},\nurl={https://openreview.net/forum?id=B1EVwkqTW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1EVwkqTW", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18367416134098105335&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1G6uM0WG", "title": "Tactical Decision Making for Lane Changing with Deep Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "A framework that provides a policy for autonomous lane changing by learning to make high-level tactical decisions with deep reinforcement learning, and maintaining a tight integration with a low-level controller to take low-level actions.", "abstract": "In this paper, we consider the problem of autonomous lane changing for self driving vehicles in a multi-lane, multi-agent setting. We present a framework that demonstrates a more structured and data efficient alternative to end-to-end complete policy learning on problems where the high-level policy is hard to formulate using traditional optimization or rule based methods but well designed low-level controllers are available. Our framework uses deep reinforcement learning solely to obtain a high-level policy for tactical decision making, while still maintaining a tight integration with the low-level controller, thus getting the best of both worlds. We accomplish this with Q-masking, a technique with which we are able to incorporate prior knowledge, constraints, and information from a low-level controller, directly in to the learning process thereby simplifying the reward function and making learning faster and data efficient. 
We provide preliminary results in a simulator and show our approach to be more efficient than a greedy baseline, and more successful and safer than human driving.", "keywords": "autonomous lane changing;decision making;deep reinforcement learning;q-learning", "primary_area": "", "supplementary_material": "", "author": "Mustafa Mukadam;Akansel Cosgun;Alireza Nakhaei;Kikuo Fujimura", "authorids": "mmukadam3@gatech.edu;acosgun@hra.com;anakhaei@hra.com;kfujimura@hra.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1G6uM0WG", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 3, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3978845440526580415&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Learning from Between-class Examples for Deep Sound Recognition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/259", "id": "B1Gi6LeRZ", "author_site": "Yuji Tokozume, Yoshitaka Ushiku, Tatsuya Harada", "tldr": "We propose an novel learning method for deep sound recognition named BC learning.", "abstract": "Deep learning methods have achieved high performance in sound recognition tasks. Deciding how to feed the training data is important for further performance improvement. We propose a novel learning method for deep sound recognition: Between-Class learning (BC learning). Our strategy is to learn a discriminative feature space by recognizing the between-class sounds as between-class sounds. We generate between-class sounds by mixing two sounds belonging to different classes with a random ratio. We then input the mixed sound to the model and train the model to output the mixing ratio. The advantages of BC learning are not limited only to the increase in variation of the training data; BC learning leads to an enlargement of Fisher\u2019s criterion in the feature space and a regularization of the positional relationship among the feature distributions of the classes. The experimental results show that BC learning improves the performance on various sound recognition networks, datasets, and data augmentation schemes, in which BC learning proves to be always beneficial. Furthermore, we construct a new deep sound recognition network (EnvNet-v2) and train it with BC learning. 
As a result, we achieved a performance that surpasses the human level.", "keywords": "sound recognition;supervised learning;feature learning", "primary_area": "", "supplementary_material": "", "author": "Yuji Tokozume;Yoshitaka Ushiku;Tatsuya Harada", "authorids": "tokozume@mi.t.u-tokyo.ac.jp;ushiku@mi.t.u-tokyo.ac.jp;harada@mi.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntokozume2018learning,\ntitle={Learning from Between-class Examples for Deep Sound Recognition},\nauthor={Yuji Tokozume and Yoshitaka Ushiku and Tatsuya Harada},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1Gi6LeRZ},\n}", "github": "[![github](/images/github_icon.svg) mil-tokyo/bc_learning_sound](https://github.com/mil-tokyo/bc_learning_sound) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=B1Gi6LeRZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;8;9", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13221046760066147561&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1Gi6LeRZ", "pdf": "https://openreview.net/pdf?id=B1Gi6LeRZ", "email": ";;", "author_num": 3 }, { "title": "The High-Dimensional Geometry of Binary Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/244", "id": "B1IDRdeCW", "author_site": "Alexander Anderson, Cory P Berg", "tldr": "Recent successes of Binary Neural Networks can be understood based on the geometry of high-dimensional binary vectors", "abstract": "Recent research has shown that one can train a neural network with binary weights and activations at train time by augmenting the weights with a high-precision continuous latent variable that accumulates small changes from stochastic gradient descent. However, there is a dearth of work to explain why one can effectively capture the features in data with binary weights and activations. Our main result is that the neural networks with binary weights and activations trained using the method of Courbariaux, Hubara et al. (2016) work because of the high-dimensional geometry of binary vectors. In particular, the ideal continuous vectors that extract out features in the intermediate representations of these BNNs are well-approximated by binary vectors in the sense that dot products are approximately preserved. Compared to previous research that demonstrated good classification performance with BNNs, our work explains why these BNNs work in terms of HD geometry. Furthermore, the results and analysis used on BNNs are shown to generalize to neural networks with ternary weights and activations. Our theory serves as a foundation for understanding not only BNNs but a variety of methods that seek to compress traditional neural networks. 
Furthermore, a better understanding of multilayer binary neural networks serves as a starting point for generalizing BNNs to other neural network architectures such as recurrent neural networks.", "keywords": "Binary Neural Networks;Neural Network Visualization", "primary_area": "", "supplementary_material": "", "author": "Alexander G. Anderson;Cory P. Berg", "authorids": "aga@berkeley.edu;cberg500@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ng.2018the,\ntitle={The High-Dimensional Geometry of Binary Neural Networks},\nauthor={Alexander G. Anderson and Cory P. Berg},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1IDRdeCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;7", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16436940415078137739&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1IDRdeCW", "pdf": "https://openreview.net/pdf?id=B1IDRdeCW", "email": ";", "author_num": 2 }, { "title": "Understanding Deep Neural Networks with Rectified Linear Units", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/155", "id": "B1J_rgWRW", "author_site": "Raman Arora, Amitabh Basu, Poorya Mianjy, Anirbit Mukherjee", "tldr": "This paper 1) characterizes functions representable by ReLU DNNs, 2) formally studies the benefit of depth in such architectures, 3) gives an algorithm to implement empirical risk minimization to global optimality for two layer ReLU nets.", "abstract": "In this paper we investigate the family of functions representable by deep neural networks (DNN) with rectified linear units (ReLU). We give an algorithm to train a ReLU DNN with one hidden layer to {\\em global optimality} with runtime polynomial in the data size albeit exponential in the input dimension. Further, we improve on the known lower bounds on size (from exponential to super exponential) for approximating a ReLU deep net function by a shallower ReLU net. Our gap theorems hold for smoothly parametrized families of ``hard'' functions, contrary to countable, discrete families known in the literature. An example consequence of our gap theorems is the following: for every natural number $k$ there exists a function representable by a ReLU DNN with $k^2$ hidden layers and total size $k^3$, such that any ReLU DNN with at most $k$ hidden layers will require at least $\\frac12k^{k+1}-1$ total nodes. Finally, for the family of $\\R^n\\to \\R$ DNNs with ReLU activations, we show a new lowerbound on the number of affine pieces, which is larger than previous constructions in certain regimes of the network architecture and most distinctively our lowerbound is demonstrated by an explicit construction of a \\emph{smoothly parameterized} family of functions attaining this scaling. 
Our construction utilizes the theory of zonotopes from polyhedral theory.", "keywords": "expressive power;benefits of depth;empirical risk minimization;global optimality;computational hardness;combinatorial optimization", "primary_area": "", "supplementary_material": "", "author": "Raman Arora;Amitabh Basu;Poorya Mianjy;Anirbit Mukherjee", "authorids": "arora@cs.jhu.edu;basu.amitabh@jhu.edu;mianjy@jhu.edu;amukhe14@jhu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\narora2018understanding,\ntitle={Understanding Deep Neural Networks with Rectified Linear Units},\nauthor={Raman Arora and Amitabh Basu and Poorya Mianjy and Anirbit Mukherjee},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1J_rgWRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 864, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14379463550186291661&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1J_rgWRW", "pdf": "https://openreview.net/pdf?id=B1J_rgWRW", "email": ";;;", "author_num": 4 }, { "id": "B1KFAGWAZ", "title": "Revisiting The Master-Slave Architecture In Multi-Agent Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We revisit the idea of the master-slave architecture in multi-agent deep reinforcement learning and outperform state-of-the-art methods.", "abstract": "Many tasks in artificial intelligence require the collaboration of multiple agents. We examine deep reinforcement learning for multi-agent domains. Recent research efforts often take the form of two seemingly conflicting perspectives, the decentralized perspective, where each agent is supposed to have its own controller; and the centralized perspective, where one assumes there is a larger model controlling all agents. In this regard, we revisit the idea of the master-slave architecture by incorporating both perspectives within one framework. Such a hierarchical structure naturally leverages advantages from one another. The idea of combining both perspectives is intuitive and can be well motivated by many real-world systems; however, out of a variety of possible realizations, we highlight three key ingredients, i.e. composed action representation, learnable communication and independent reasoning. 
With network designs to facilitate these explicitly, our proposal consistently outperforms the latest competing methods both in synthetic experiments and when applied to challenging StarCraft micromanagement tasks.", "keywords": "Deep Reinforcement Learning;Multi-Agent Reinforcement Learning;StarCraft Micromanagement Tasks", "primary_area": "", "supplementary_material": "", "author": "Xiangyu Kong;Fangchen Liu;Bo Xin;Yizhou Wang", "authorids": "kxyzc1992@gmail.com;liufangchen@pku.edu.cn;boxin@microsoft.com;yizhou.wang@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkong2018revisiting,\ntitle={Revisiting The Master-Slave Architecture In Multi-Agent Deep Reinforcement Learning},\nauthor={Xiangyu Kong and Fangchen Liu and Bo Xin and Yizhou Wang},\nyear={2018},\nurl={https://openreview.net/forum?id=B1KFAGWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1KFAGWAZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13136873656008448410&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1KJJf-R-", "title": "Neural Program Search: Solving Data Processing Tasks from Description and Examples", "track": "main", "status": "Workshop", "tldr": "Program synthesis from natural language description and input / output examples via Tree-Beam Search over Seq2Tree model", "abstract": "We present Neural Program Search, an algorithm to generate programs from a natural language description and a small number of input / output examples. The algorithm combines methods from the Deep Learning and Program Synthesis fields by designing a rich domain-specific language (DSL) and defining an efficient search algorithm over it, guided by a Seq2Tree model. To evaluate the quality of the approach, we also present a semi-synthetic dataset of descriptions with test examples and corresponding programs. 
We show that our algorithm significantly outperforms a sequence-to-sequence model with attention baseline.", "keywords": "Deep learning;Structured Prediction;Natural Language Processing;Neural Program Synthesis", "primary_area": "", "supplementary_material": "", "author": "Illia Polosukhin;Alexander Skidanov", "authorids": "illia@near.ai;alex@near.ai", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npolosukhin2018neural,\ntitle={Neural Program Search: Solving Data Processing Tasks from Description and Examples},\nauthor={Illia Polosukhin and Alexander Skidanov},\nyear={2018},\nurl={https://openreview.net/forum?id=B1KJJf-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1KJJf-R-", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12032422523523494390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Deep Learning as a Mixed Convex-Combinatorial Optimization Problem", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/92", "id": "B1Lc-Gb0Z", "author_site": "Abram Friesen, Pedro Domingos", "tldr": "We learn deep networks of hard-threshold units by setting hidden-unit targets using combinatorial optimization and weights by convex optimization, resulting in improved performance on ImageNet.", "abstract": "As neural networks grow deeper and wider, learning networks with hard-threshold activations is becoming increasingly important, both for network quantization, which can drastically reduce time and energy requirements, and for creating large integrated systems of deep networks, which may have non-differentiable components and must avoid vanishing and exploding gradients for effective learning. However, since gradient descent is not applicable to hard-threshold functions, it is not clear how to learn them in a principled way. We address this problem by observing that setting targets for hard-threshold hidden units in order to minimize loss is a discrete optimization problem, and can be solved as such. The discrete optimization goal is to find a set of targets such that each unit, including the output, has a linearly separable problem to solve. Given these targets, the network decomposes into individual perceptrons, which can then be learned with standard convex approaches. Based on this, we develop a recursive mini-batch algorithm for learning deep hard-threshold networks that includes the popular but poorly justified straight-through estimator as a special case. Empirically, we show that our algorithm improves classification accuracy in a number of settings, including for AlexNet and ResNet-18 on ImageNet, when compared to the straight-through estimator.", "keywords": "hard-threshold units;combinatorial optimization;target propagation;straight-through estimation;quantization", "primary_area": "", "supplementary_material": "", "author": "Abram L. 
Friesen;Pedro Domingos", "authorids": "afriesen@cs.washington.edu;pedrod@cs.washington.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nl.2018deep,\ntitle={Deep Learning as a Mixed Convex-Combinatorial Optimization Problem},\nauthor={Abram L. Friesen and Pedro Domingos},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1Lc-Gb0Z},\n}", "github": "[![github](/images/github_icon.svg) afriesen/ftprop](https://github.com/afriesen/ftprop)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14079107033151501838&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=B1Lc-Gb0Z", "pdf": "https://openreview.net/pdf?id=B1Lc-Gb0Z", "email": ";", "author_num": 2 }, { "id": "B1NGT8xCZ", "title": "Principled Hybrids of Generative and Discriminative Domain Adaptation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a probabilistic framework for domain adaptation that blends both generative and discriminative modeling in a principled way. Under this framework, generative and discriminative models correspond to specific choices of the prior over parameters. This provides us a very general way to interpolate between generative and discriminative extremes through different choices of priors. By maximizing both the marginal and the conditional log-likelihoods, models derived from this framework can use both labeled instances from the source domain as well as unlabeled instances from \\emph{both} source and target domains. Under this framework, we show that the popular reconstruction loss of autoencoder corresponds to an upper bound of the negative marginal log-likelihoods of unlabeled instances, where marginal distributions are given by proper kernel density estimations. This provides a way to interpret the empirical success of autoencoders in domain adaptation and semi-supervised learning. We instantiate our framework using neural networks, and build a concrete model, \\emph{DAuto}. 
Empirically, we demonstrate the effectiveness of DAuto on text, image and speech datasets, showing that it outperforms related competitors when domain adaptation is possible.\n", "keywords": "domain adaptation;neural networks;generative models;discriminative models", "primary_area": "", "supplementary_material": "", "author": "Han Zhao;Zhenyao Zhu;Junjie Hu;Adam Coates;Geoff Gordon", "authorids": "han.zhao@cs.cmu.edu;zhenyaozhu@baidu.com;junjieh@cmu.edu;adamcoates@baidu.com;ggordon@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhao2018principled,\ntitle={Principled Hybrids of Generative and Discriminative Domain Adaptation},\nauthor={Han Zhao and Zhenyao Zhu and Junjie Hu and Adam Coates and Geoff Gordon},\nyear={2018},\nurl={https://openreview.net/forum?id=B1NGT8xCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1NGT8xCZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=564536607600364565&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3 }, { "id": "B1NKuC6SG", "title": "Language Style Transfer from Non-Parallel Text with Arbitrary Styles", "track": "main", "status": "Withdraw", "tldr": "We present an encoder-decoder framework for language style transfer, which allows for the use of non-parallel data and source data with various unknown language styles.", "abstract": "Language style transfer is the problem of migrating the content of a source sentence to a target style. In many applications, parallel training data are not available and source sentences to be transferred may have arbitrary and unknown styles. In this paper, we present an encoder-decoder framework under this problem setting. Each sentence is encoded into its content and style latent representations. By recombining the content with the target style, we can decode a sentence aligned in the target domain. To adequately constrain the encoding and decoding functions, we couple them with two loss functions. The first is a style discrepancy loss, enforcing that the style representation accurately encodes the style information guided by the discrepancy between the sentence style and the target style. The second is a cycle consistency loss, which ensures that the transferred sentence should preserve the content of the original sentence disentangled from its style. We validate the effectiveness of our proposed model on two tasks: sentiment modification of restaurant reviews, and dialog response revision with a romantic style.", "keywords": "style transfer;text generation;non-parallel data", "primary_area": "", "supplementary_material": "", "author": "Yanpeng Zhao;Victoria W. 
Bi;Deng Cai;Xiaojiang Liu;Kewei Tu;Shuming Shi", "authorids": "zhaoyp1@shanghaitech.edu.cn;victoriabi@tencent.com;thisisjcykcd@gmail.com;kieranliu@tencent.com;tukw@shanghaitech.edu.cn;shumingshi@tencent.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@article{\nzhao2018language,\ntitle={Language Style Transfer from Non-Parallel Text with Arbitrary Styles},\nauthor={Yanpeng Zhao, Victoria W. Bi, Deng Cai, Xiaojiang Liu, Kewei Tu, Shuming Shi},\njournal={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJLPel-CW},\nnote={rejected}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1NKuC6SG", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 6, "corr_rating_confidence": 0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16829403004742480923&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1NOXfWR-", "title": "Neural Task Graph Execution", "track": "main", "status": "Reject", "tldr": "", "abstract": "In order to develop a scalable multi-task reinforcement learning (RL) agent that is able to execute many complex tasks, this paper introduces a new RL problem where the agent is required to execute a given task graph which describes a set of subtasks and dependencies among them. Unlike existing approaches which explicitly describe what the agent should do, our problem only describes properties of subtasks and relationships between them, which requires the agent to perform a complex reasoning to find the optimal subtask to execute. To solve this problem, we propose a neural task graph solver (NTS) which encodes the task graph using a recursive neural network. To overcome the difficulty of training, we propose a novel non-parametric gradient-based policy that performs back-propagation over a differentiable form of the task graph to compute the influence of each subtask on the other subtasks. Our NTS is pre-trained to approximate the proposed gradient-based policy and fine-tuned through actor-critic method. The experimental results on a 2D visual domain show that our method to pre-train from the gradient-based policy significantly improves the performance of NTS. We also demonstrate that our agent can perform a complex reasoning to find the optimal way of executing the task graph and generalize well to unseen task graphs. In addition, we compare our agent with a Monte-Carlo Tree Search (MCTS) method showing that our method is much more efficient than MCTS, and the performance of our agent can be further improved by combining with MCTS. 
The demo video is available at https://youtu.be/e_ZXVS5VutM.", "keywords": "deep reinforcement learning;task execution;instruction execution", "primary_area": "", "supplementary_material": "", "author": "Sungryull Sohn;Junhyuk Oh;Honglak Lee", "authorids": "srsohn@umich.edu;junhyuk@umich.edu;honglak@eecs.umich.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsohn2018neural,\ntitle={Neural Task Graph Execution},\nauthor={Sungryull Sohn and Junhyuk Oh and Honglak Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=B1NOXfWR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1NOXfWR-", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:N4R5vo2gePoJ:scholar.google.com/&scioq=Neural+Task+Graph+Execution&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Spectral Normalization for Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/331", "id": "B1QRgziT-", "author_site": "Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida", "tldr": "We propose a novel weight normalization technique called spectral normalization to stabilize the training of the discriminator of GANs.", "abstract": "One of the challenges in the study of generative adversarial networks is the instability of their training. \nIn this paper, we propose a novel weight normalization technique called spectral normalization to stabilize the training of the discriminator.\nOur new normalization technique is computationally light and easy to incorporate into existing implementations. \nWe tested the efficacy of spectral normalization on the CIFAR10, STL-10, and ILSVRC2012 datasets, and we experimentally confirmed that spectrally normalized GANs (SN-GANs) are capable of generating images of better or equal quality relative to the previous training stabilization techniques. 
", "keywords": "Generative Adversarial Networks;Deep Generative Models;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Takeru Miyato;Toshiki Kataoka;Masanori Koyama;Yuichi Yoshida", "authorids": "miyato@preferred.jp;kataoka@preferred.jp;koyama.masanori@gmail.com;yyoshida@nii.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmiyato2018spectral,\ntitle={Spectral Normalization for Generative Adversarial Networks},\nauthor={Takeru Miyato and Toshiki Kataoka and Masanori Koyama and Yuichi Yoshida},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1QRgziT-},\n}", "github": "[![github](/images/github_icon.svg) pfnet-research/sngan_projection](https://github.com/pfnet-research/sngan_projection) + [![Papers with Code](/images/pwc_icon.svg) 36 community implementations](https://paperswithcode.com/paper/?openreview=B1QRgziT-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;2;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 31, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 5867, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=973410365172845184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1QRgziT-", "pdf": "https://openreview.net/pdf?id=B1QRgziT-", "email": ";;;", "author_num": 4 }, { "title": "Empirical Risk Landscape Analysis for Understanding Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/329", "id": "B1QgVti6Z", "author_site": "Pan Zhou, Jiashi Feng, Pan Zhou", "tldr": "", "abstract": "This work aims to provide comprehensive landscape analysis of empirical risk in deep neural networks (DNNs), including the convergence behavior of its gradient, its stationary points and the empirical risk itself to their corresponding population counterparts, which reveals how various network parameters determine the convergence performance. In particular, for an $l$-layer linear neural network consisting of $\\dm_i$ neurons in the $i$-th layer, we prove the gradient of its empirical risk uniformly converges to the one of its population risk, at the rate of $\\mathcal{O}(r^{2l} \\sqrt{l\\sqrt{\\max_i \\dm_i} s\\log(d/l)/n})$. Here $d$ is the total weight dimension, $s$ is the number of nonzero entries of all the weights and the magnitude of weights per layer is upper bounded by $r$. Moreover, we prove the one-to-one correspondence of the non-degenerate stationary points between the empirical and population risks and provide convergence guarantee for each pair. We also establish the uniform convergence of the empirical risk to its population counterpart and further derive the stability and generalization bounds for the empirical risk. In addition, we analyze these properties for deep \\emph{nonlinear} neural networks with sigmoid activation functions. 
We prove similar results for convergence behavior of their empirical risk gradients, non-degenerate stationary points as well as the empirical risk itself.\n\nTo our best knowledge, this work is the first one theoretically characterizing the uniform convergence of the gradient and stationary points of the empirical risk of DNN models, which benefits the theoretical understanding on how the neural network depth $l$, the layer width $\\dm_i$, the network size $d$, the sparsity in weight and the parameter magnitude $r$ determine the neural network landscape.", "keywords": "Deep Learning Analysis;Deep Learning Theory;Empirical Risk;Landscape Analysis;Nonconvex Optimization", "primary_area": "", "supplementary_material": "", "author": "Pan Zhou;Jiashi Feng", "authorids": "panzhou3@gmail.com;", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhou2018empirical,\ntitle={Empirical Risk Landscape Analysis for Understanding Deep Neural Networks},\nauthor={Pan Zhou and Jiashi Feng},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1QgVti6Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "3;7;7", "confidence": "3;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4445416685477578234&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1QgVti6Z", "pdf": "https://openreview.net/pdf?id=B1QgVti6Z", "email": ";", "author_num": 2 }, { "title": "Fidelity-Weighted Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/76", "id": "B1X0mzZCW", "author_site": "Mostafa Dehghani, Arash Mehrjou, Stephan Gouws, Jaap Kamps, Bernhard Schoelkopf", "tldr": "We propose Fidelity-weighted Learning, a semi-supervised teacher-student approach for training neural networks using weakly-labeled data.", "abstract": "Training deep neural networks requires many training samples, but in practice training labels are expensive to obtain and may be of varying quality, as some may be from trusted expert labelers while others might be from heuristics or other sources of weak supervision such as crowd-sourcing. This creates a fundamental quality- versus-quantity trade-off in the learning process. Do we learn from the small amount of high-quality data or the potentially large amount of weakly-labeled data? We argue that if the learner could somehow know and take the label-quality into account when learning the data representation, we could get the best of both worlds. To this end, we propose \u201cfidelity-weighted learning\u201d (FWL), a semi-supervised student- teacher approach for training deep neural networks using weakly-labeled data. FWL modulates the parameter updates to a student network (trained on the task we care about) on a per-sample basis according to the posterior confidence of its label-quality estimated by a teacher (who has access to the high-quality labels). Both student and teacher are learned from the data. 
We evaluate FWL on two tasks in information retrieval and natural language processing where we outperform state-of-the-art alternative semi-supervised methods, indicating that our approach makes better use of strong and weak labels, and leads to better task-dependent data representations.", "keywords": "fidelity-weighted learning;semisupervised learning;weakly-labeled data;teacher-student", "primary_area": "", "supplementary_material": "", "author": "Mostafa Dehghani;Arash Mehrjou;Stephan Gouws;Jaap Kamps;Bernhard Sch\u00f6lkopf", "authorids": "dehghani@uva.nl;amehrjou@tuebingen.mpg.de;sgouws@google.com;kamps@uva.nl;bs@tuebingen.mpg.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndehghani2018fidelityweighted,\ntitle={Fidelity-Weighted Learning},\nauthor={Mostafa Dehghani and Arash Mehrjou and Stephan Gouws and Jaap Kamps and Bernhard Sch\u00f6lkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1X0mzZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6679593996852506297&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=B1X0mzZCW", "pdf": "https://openreview.net/pdf?id=B1X0mzZCW", "email": ";;;;", "author_num": 5 }, { "id": "B1X4DWWRb", "title": "Learning Weighted Representations for Generalization Across Designs", "track": "main", "status": "Reject", "tldr": "A theory and algorithmic framework for prediction under distributional shift, including causal effect estimation and domain adaptation", "abstract": "Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to machine learning applications. One example is the estimation of treatment effects from observational data, where a subtask is to predict the effect of a treatment on subjects that are systematically different from those who received the treatment in the data. A related kind of distributional shift appears in unsupervised domain adaptation, where we are tasked with generalizing to a distribution of inputs that is different from the one in which we observe labels. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift are often heuristic or rely on assumptions that are rarely true in practice, such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. In this work, we devise a bound on the generalization error under design shift, based on integral probability metrics and sample re-weighting. We combine this idea with representation learning, generalizing and tightening existing results in this space. 
Finally, we propose an algorithmic framework inspired by our bound and verify its effectiveness in causal effect estimation.", "keywords": "Distributional shift;causal effects;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Fredrik D. Johansson;Nathan Kallus;Uri Shalit;David Sontag", "authorids": "fredrikj@mit.edu;kallus@cornell.edu;urish22@gmail.com;dsontag@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nd.2018learning,\ntitle={Learning Weighted Representations for Generalization Across Designs},\nauthor={Fredrik D. Johansson and Nathan Kallus and Uri Shalit and David Sontag},\nyear={2018},\nurl={https://openreview.net/forum?id=B1X4DWWRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1X4DWWRb", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11319918029186302226&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Don't Decay the Learning Rate, Increase the Batch Size", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/272", "id": "B1Yy1BxCZ", "author_site": "Samuel Smith, Pieter-Jan Kindermans, Chris Ying, Quoc V Le", "tldr": "Decaying the learning rate and increasing the batch size during training are equivalent.", "abstract": "It is common practice to decay the learning rate. Here we show one can usually obtain the same learning curve on both training and test sets by instead increasing the batch size during training. This procedure is successful for stochastic gradient descent (SGD), SGD with momentum, Nesterov momentum, and Adam. It reaches equivalent test accuracies after the same number of training epochs, but with fewer parameter updates, leading to greater parallelism and shorter training times. We can further reduce the number of parameter updates by increasing the learning rate $\epsilon$ and scaling the batch size $B \propto \epsilon$. Finally, one can increase the momentum coefficient $m$ and scale $B \propto 1/(1-m)$, although this tends to slightly reduce the test accuracy. Crucially, our techniques allow us to repurpose existing training schedules for large batch training with no hyper-parameter tuning. We train ResNet-50 on ImageNet to 76.1% validation accuracy in under 30 minutes.", "keywords": "batch size;learning rate;simulated annealing;large batch training;scaling rules;stochastic gradient descent;sgd;imagenet;optimization", "primary_area": "", "supplementary_material": "", "author": "Samuel L. Smith;Pieter-Jan Kindermans;Chris Ying;Quoc V. Le", "authorids": "slsmith@google.com;pikinder@google.com;chrisying@google.com;qvl@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nl.2018dont,\ntitle={Don't Decay the Learning Rate, Increase the Batch Size},\nauthor={Samuel L. Smith and Pieter-Jan Kindermans and Quoc V. 
Le},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1Yy1BxCZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1Yy1BxCZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1362, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3840223745264283290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1Yy1BxCZ", "pdf": "https://openreview.net/pdf?id=B1Yy1BxCZ", "email": ";;;", "author_num": 4 }, { "id": "B1Z3W-b0W", "title": "Learning to Infer", "track": "main", "status": "Workshop", "tldr": "We propose a new class of inference models that iteratively encode gradients to estimate approximate posterior distributions.", "abstract": "Inference models, which replace an optimization-based inference procedure with a learned model, have been fundamental in advancing Bayesian deep learning, the most notable example being variational auto-encoders (VAEs). In this paper, we propose iterative inference models, which learn how to optimize a variational lower bound through repeatedly encoding gradients. Our approach generalizes VAEs under certain conditions, and by viewing VAEs in the context of iterative inference, we provide further insight into several recent empirical findings. We demonstrate the inference optimization capabilities of iterative inference models, explore unique aspects of these models, and show that they outperform standard inference models on typical benchmark data sets.", "keywords": "Bayesian Deep Learning;Amortized Inference;Variational Auto-Encoders;Learning to Learn", "primary_area": "", "supplementary_material": "", "author": "Joseph Marino;Yisong Yue;Stephan Mandt", "authorids": "jmarino@caltech.edu;yyue@caltech.edu;stephan.mandt@disneyresearch.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmarino2018learning,\ntitle={Learning to Infer},\nauthor={Joseph Marino and Yisong Yue and Stephan Mandt},\nyear={2018},\nurl={https://openreview.net/forum?id=B1Z3W-b0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1Z3W-b0W", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13375112055355016532&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "B1ZZTfZAW", "title": "Real-valued (Medical) Time Series Generation with Recurrent Conditional GANs", "track": "main", "status": "Reject", "tldr": "Conditional recurrent GANs for real-valued medical sequences generation, showing novel evaluation approaches and an empirical privacy analysis.", "abstract": "Generative Adversarial Networks (GANs) have shown remarkable success as a framework for training models to produce realistic-looking data. 
In this work, we propose a Recurrent GAN (RGAN) and Recurrent Conditional GAN (RCGAN) to produce realistic real-valued multi-dimensional time series, with an emphasis on their application to medical data. RGANs make use of recurrent neural networks (RNNs) in the generator and the discriminator. In the case of RCGANs, both of these RNNs are conditioned on auxiliary information. We demonstrate our models in a set of toy datasets, where we show visually and quantitatively (using sample likelihood and maximum mean discrepancy) that they can successfully generate realistic time-series. We also describe novel evaluation methods for GANs, where we generate a synthetic labelled training dataset, and evaluate on a real test set the performance of a model trained on the synthetic data, and vice-versa. We illustrate with these metrics that RCGANs can generate time-series data useful for supervised training, with only minor degradation in performance on real test data. This is demonstrated on digit classification from \u2018serialised\u2019 MNIST and by training an early warning system on a medical dataset of 17,000 patients from an intensive care unit. We further discuss and analyse the privacy concerns that may arise when using RCGANs to generate realistic synthetic medical time series data, and demonstrate results from differentially private training of the RCGAN.", "keywords": "GAN;medical;records;time;series;generation;privacy", "primary_area": "", "supplementary_material": "", "author": "Stephanie Hyland;Crist\u00f3bal Esteban;Gunnar R\u00e4tsch", "authorids": "stephanie.hyland@inf.ethz.ch;cr_est@ethz.ch;raetsch@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhyland2018realvalued,\ntitle={Real-valued (Medical) Time Series Generation with Recurrent Conditional {GAN}s},\nauthor={Stephanie Hyland and Crist\u00f3bal Esteban and Gunnar R\u00e4tsch},\nyear={2018},\nurl={https://openreview.net/forum?id=B1ZZTfZAW},\n}", "github": "[![github](/images/github_icon.svg) ratschlab/RGAN](https://github.com/ratschlab/RGAN) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=B1ZZTfZAW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ZZTfZAW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1094, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4450697191042787143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "WRPN: Wide Reduced-Precision Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/208", "id": "B1ZvaaeAZ", "author_site": "Asit Mishra, Eriko Nurvitadhi, Jeffrey J Cook, Debbie Marr", "tldr": "Lowering precision (to 4-bits, 2-bits and even binary) and widening the filter banks gives as accurate network as those obtained with FP32 weights and activations.", "abstract": "For computer vision applications, prior works have shown the efficacy of reducing numeric precision of model parameters (network weights) in deep neural networks. Activation maps, however, occupy a large memory footprint during both the training and inference step when using mini-batches of inputs. 
One way to reduce this large memory footprint is to reduce the precision of activations. However, past works have shown that reducing the precision of activations hurts model accuracy. We study schemes to train networks from scratch using reduced-precision activations without hurting accuracy. We reduce the precision of activation maps (along with model parameters) and increase the number of filter maps in a layer, and find that this scheme matches or surpasses the accuracy of the baseline full-precision network. As a result, one can significantly improve the execution efficiency (e.g. reduce dynamic memory footprint, memory bandwidth and computational energy) and speed up the training and inference process with appropriate hardware support. We call our scheme WRPN -- wide reduced-precision networks. We report results and show that the WRPN scheme achieves better accuracy than previously reported on the ILSVRC-12 dataset while being computationally less expensive than previously reported reduced-precision networks.", "keywords": "Low precision;binary;ternary;4-bits networks", "primary_area": "", "supplementary_material": "", "author": "Asit Mishra;Eriko Nurvitadhi;Jeffrey J Cook;Debbie Marr", "authorids": "asit.k.mishra@intel.com;eriko.nurvitadhi@intel.com;jeffrey.j.cook@intel.com;debbie.marr@intel.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmishra2018wrpn,\ntitle={{WRPN}: Wide Reduced-Precision Networks},\nauthor={Asit Mishra and Eriko Nurvitadhi and Jeffrey J Cook and Debbie Marr},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1ZvaaeAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;5;9", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 382, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7570145345985295337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1ZvaaeAZ", "pdf": "https://openreview.net/pdf?id=B1ZvaaeAZ", "email": ";;;", "author_num": 4 }, { "title": "Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/173", "id": "B1ae1lZRb", "author_site": "Asit Mishra, Debbie Marr", "tldr": "We show that knowledge transfer techniques can improve the accuracy of low precision networks and set new state-of-the-art accuracy for ternary and 4-bits precision. ", "abstract": "Deep learning networks have achieved state-of-the-art accuracies on computer vision workloads like image classification and object detection. The performant systems, however, typically involve big models with numerous parameters. Once trained, a challenging aspect for such top performing models is deployment on resource constrained inference systems -- the models (often deep networks or wide networks or both) are compute and memory intensive. Low precision numerics and model compression using knowledge distillation are popular techniques to lower both the compute requirements and memory footprint of these deployed models. 
In this paper, we study the combination of these two techniques and show that the performance of low precision networks can be significantly improved by using knowledge distillation techniques. We call our approach Apprentice and show state-of-the-art accuracies using ternary precision and 4-bit precision for many variants of ResNet architecture on ImageNet dataset. We study three schemes in which one can apply knowledge distillation techniques to various stages of the train-and-deploy pipeline.", "keywords": "Ternary;4-bits;low precision;knowledge distillation;knowledge transfer;model compression", "primary_area": "", "supplementary_material": "", "author": "Asit Mishra;Debbie Marr", "authorids": "asit.k.mishra@intel.com;debbie.marr@intel.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmishra2018apprentice,\ntitle={Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy},\nauthor={Asit Mishra and Debbie Marr},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1ae1lZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4466439802890649800&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1ae1lZRb", "pdf": "https://openreview.net/pdf?id=B1ae1lZRb", "email": ";", "author_num": 2 }, { "title": "Overcoming Catastrophic Interference using Conceptor-Aided Backpropagation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/233", "id": "B1al7jg0b", "author_site": "Xu He, Herbert Jaeger", "tldr": "We propose a variant of the backpropagation algorithm, in which gradients are shielded by conceptors against degradation of previously learned tasks.", "abstract": "Catastrophic interference has been a major roadblock in the research of continual learning. Here we propose a variant of the back-propagation algorithm, \"Conceptor-Aided Backprop\" (CAB), in which gradients are shielded by conceptors against degradation of previously learned tasks. Conceptors have their origin in reservoir computing, where they have been previously shown to overcome catastrophic forgetting. CAB extends these results to deep feedforward networks. 
On the disjoint and permuted MNIST tasks, CAB outperforms two other methods for coping with catastrophic interference that have recently been proposed.", "keywords": "Catastrophic Interference;Conceptor;Backpropagation;Continual Learning;Lifelong Learning", "primary_area": "", "supplementary_material": "", "author": "Xu He;Herbert Jaeger", "authorids": "x.he@jacobs-university.de;h.jaeger@jacobs-university.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhe2018overcoming,\ntitle={Overcoming Catastrophic Interference using Conceptor-Aided Backpropagation},\nauthor={Xu He and Herbert Jaeger},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1al7jg0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;5", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3069201795420316686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=B1al7jg0b", "pdf": "https://openreview.net/pdf?id=B1al7jg0b", "email": ";", "author_num": 2 }, { "id": "B1bgpzZAZ", "title": "ElimiNet: A Model for Eliminating Options for Reading Comprehension with Multiple Choice Questions", "track": "main", "status": "Reject", "tldr": "A model combining elimination and selection for answering multiple choice questions", "abstract": "The task of Reading Comprehension with Multiple Choice Questions requires a human (or machine) to read a given \{\textit{passage, question}\} pair and select one of the $n$ given options. The current state of the art model for this task first computes a query-aware representation for the passage and then \textit{selects} the option which has the maximum similarity with this representation. However, when humans perform this task they do not just focus on option selection but use a combination of \textit{elimination} and \textit{selection}. Specifically, a human would first try to eliminate the most irrelevant option and then read the document again in the light of this new information (and perhaps ignore portions corresponding to the eliminated option). This process could be repeated multiple times till the reader is finally ready to select the correct option. We propose \textit{ElimiNet}, a neural network based model which tries to mimic this process. Specifically, it has gates which decide whether an option can be eliminated given the \{\textit{document, question}\} pair and if so it tries to make the document representation orthogonal to this eliminated option (akin to ignoring portions of the document corresponding to the eliminated option). The model makes multiple rounds of partial elimination to refine the document representation and finally uses a selection module to pick the best option. We evaluate our model on the recently released large scale RACE dataset and show that it outperforms the current state of the art model on 7 out of the 13 question types in this dataset. 
Further we show that taking an ensemble of our \\textit{elimination-selection} based method with a \\textit{selection} based method gives us an improvement of 7\\% (relative) over the best reported performance on this dataset. \n", "keywords": "Reading Comprehension;Answering Multiple Choice Questions", "primary_area": "", "supplementary_material": "", "author": "Soham Parikh;Ananya Sai;Preksha Nema;Mitesh M Khapra", "authorids": "sohamp@cse.iitm.ac.in;ananyasb@cse.iitm.ac.in;preksha@cse.iitm.ac.in;miteshk@cse.iitm.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nparikh2018eliminet,\ntitle={ElimiNet: A Model for Eliminating Options for Reading Comprehension with Multiple Choice Questions},\nauthor={Soham Parikh and Ananya Sai and Preksha Nema and Mitesh M Khapra},\nyear={2018},\nurl={https://openreview.net/forum?id=B1bgpzZAZ},\n}", "github": "[![github](/images/github_icon.svg) sohamparikh94/ElimiNet](https://github.com/sohamparikh94/ElimiNet)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1bgpzZAZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=644655731157023640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "A Compressed Sensing View of Unsupervised Text Embeddings, Bag-of-n-Grams, and LSTMs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/96", "id": "B1e5ef-C-", "author_site": "Sanjeev Arora, Mikhail Khodak, Nikunj Umesh Saunshi, Kiran Vodrahalli", "tldr": "We use the theory of compressed sensing to prove that LSTMs can do at least as well on linear text classification as Bag-of-n-Grams.", "abstract": "Low-dimensional vector embeddings, computed using LSTMs or simpler techniques, are a popular approach for capturing the \u201cmeaning\u201d of text and a form of unsupervised learning useful for downstream tasks. However, their power is not theoretically understood. The current paper derives formal understanding by looking at the subcase of linear embedding schemes. Using the theory of compressed sensing we show that representations combining the constituent word vectors are essentially information-preserving linear measurements of Bag-of-n-Grams (BonG) representations of text. This leads to a new theoretical result about LSTMs: low-dimensional embeddings derived from a low-memory LSTM are provably at least as powerful on classification tasks, up to small error, as a linear classifier over BonG vectors, a result that extensive empirical work has thus far been unable to show. Our experiments support these theoretical findings and establish strong, simple, and unsupervised baselines on standard benchmarks that in some cases are state of the art among word-level methods. 
We also show a surprising new property of embeddings such as GloVe and word2vec: they form a good sensing matrix for text that is more efficient than random matrices, the standard sparse recovery tool, which may explain why they lead to better representations in practice.", "keywords": "theory;LSTM;unsupervised learning;word embeddings;compressed sensing;sparse recovery;document representation;text classification", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Mikhail Khodak;Nikunj Saunshi;Kiran Vodrahalli", "authorids": "arora@cs.princeton.edu;mkhodak@princeton.edu;nsaunshi@cs.princeton.edu;kiran.vodrahalli@columbia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\narora2018a,\ntitle={A Compressed Sensing View of Unsupervised Text Embeddings, Bag-of-n-Grams, and {LSTM}s},\nauthor={Sanjeev Arora and Mikhail Khodak and Nikunj Saunshi and Kiran Vodrahalli},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1e5ef-C-},\n}", "github": "[![github](/images/github_icon.svg) NLPrinceton/text_embedding](https://github.com/NLPrinceton/text_embedding) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1e5ef-C-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;1", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7865413109466847289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1e5ef-C-", "pdf": "https://openreview.net/pdf?id=B1e5ef-C-", "email": ";;;", "author_num": 4 }, { "title": "Characterizing Adversarial Subspaces Using Local Intrinsic Dimensionality", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/328", "id": "B1gJ1L2aW", "author_site": "Xingjun Ma, Bo Li, Yisen Wang, Sarah Erfani, Sudanthi Wijewickrema, Grant Schoenebeck, Dawn Song, Michael E Houle, James Bailey", "tldr": "We characterize the dimensional properties of adversarial subspaces in the neighborhood of adversarial examples via the use of Local Intrinsic Dimensionality (LID).", "abstract": "Deep Neural Networks (DNNs) have recently been shown to be vulnerable against adversarial examples, which are carefully crafted instances that can mislead DNNs to make errors during prediction. To better understand such attacks, a characterization is needed of the properties of regions (the so-called `adversarial subspaces') in which adversarial examples lie. We tackle this challenge by characterizing the dimensional properties of adversarial regions, via the use of Local Intrinsic Dimensionality (LID). LID assesses the space-filling capability of the region surrounding a reference example, based on the distance distribution of the example to its neighbors. We first provide explanations about how adversarial perturbation can affect the LID characteristic of adversarial regions, and then show empirically that LID characteristics can facilitate the distinction of adversarial examples generated using state-of-the-art attacks. 
As a proof-of-concept, we show that a potential application of LID is to distinguish adversarial examples, and the preliminary results show that it can outperform several state-of-the-art detection measures by large margins for five attack strategies considered in this paper across three benchmark datasets. Our analysis of the LID characteristic for adversarial regions not only motivates new directions of effective adversarial defense, but also opens up more challenges for developing new attacks to better understand the vulnerabilities of DNNs.", "keywords": "Adversarial Subspace;Local Intrinsic Dimensionality;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Xingjun Ma;Bo Li;Yisen Wang;Sarah M. Erfani;Sudanthi Wijewickrema;Grant Schoenebeck;Dawn Song;Michael E. Houle;James Bailey", "authorids": "xingjunm@student.unimelb.edu.au;crystalboli@berkeley.edu;wangys14@mails.tsinghua.edu.cn;sarah.erfani@unimelb.edu.au;sudanthi.wijewickrema@unimelb.edu.au;schoeneb@umich.edu;dawnsong.travel@gmail.com;meh@nii.ac.jp;baileyj@unimelb.edu.au", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nma2018characterizing,\ntitle={Characterizing Adversarial Subspaces Using Local Intrinsic Dimensionality},\nauthor={Xingjun Ma and Bo Li and Yisen Wang and Sarah M. Erfani and Sudanthi Wijewickrema and Grant Schoenebeck and Michael E. Houle and Dawn Song and James Bailey},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1gJ1L2aW},\n}", "github": "[![github](/images/github_icon.svg) xingjunm/lid_adversarial_subspace_detection](https://github.com/xingjunm/lid_adversarial_subspace_detection)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "1;4;3", "rating_avg": 7.0, "confidence_avg": 2.6666666666666665, "replies_avg": 17, "authors#_avg": 9, "corr_rating_confidence": 0.6546536707079771, "gs_citation": 903, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17134144151462669065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "openreview": "https://openreview.net/forum?id=B1gJ1L2aW", "pdf": "https://openreview.net/pdf?id=B1gJ1L2aW", "email": ";;;;;;;;", "author_num": 9 }, { "title": "On the regularization of Wasserstein GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/17", "id": "B1hYRMbCW", "author_site": "Henning Petzka, Asja Fischer, Denis Lukovnikov", "tldr": "A new regularization term can improve your training of wasserstein gans", "abstract": "Since their invention, generative adversarial networks (GANs) have become a popular approach for learning to model a distribution of real (unlabeled) data. Convergence problems during training are overcome by Wasserstein GANs which minimize the distance between the model and the empirical distribution in terms of a different metric, but thereby introduce a Lipschitz constraint into the optimization problem. A simple way to enforce the Lipschitz constraint on the class of functions, which can be modeled by the neural network, is weight clipping. 
Augmenting the loss by a regularization term that penalizes the deviation of the gradient norm of the critic (as a function of the network's input) from one, was proposed as an alternative that improves training. We present theoretical arguments why using a weaker regularization term enforcing the Lipschitz constraint is preferable. These arguments are supported by experimental results on several data sets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Henning Petzka;Asja Fischer;Denis Lukovnikov", "authorids": "henning.petzka@iais.fraunhofer.de;asja.fischer@gmail.com;lukovnik@cs.uni-bonn.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npetzka2018on,\ntitle={On the regularization of Wasserstein {GAN}s},\nauthor={Henning Petzka and Asja Fischer and Denis Lukovnikov},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1hYRMbCW},\n}", "github": "[![github](/images/github_icon.svg) lukovnikov/improved_wgan_training](https://github.com/lukovnikov/improved_wgan_training) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1hYRMbCW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "2;6;7", "confidence": "2;5;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844386, "gs_citation": 309, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16449463251581049938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1hYRMbCW", "pdf": "https://openreview.net/pdf?id=B1hYRMbCW", "email": ";;", "author_num": 3 }, { "title": "N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/132", "id": "B1hcZZ-AW", "author_site": "Anubhav Ashok, Nicholas Rhinehart, Fares Beainy, Kris M Kitani", "tldr": "A novel reinforcement learning based approach to compress deep neural networks with knowledge distillation", "abstract": "While bigger and deeper neural network architectures continue to advance the state-of-the-art for many computer vision tasks, real-world adoption of these networks is impeded by hardware and speed constraints. Conventional model compression methods attempt to address this problem by modifying the architecture manually or using pre-defined heuristics. Since the space of all reduced architectures is very large, modifying the architecture of a deep neural network in this way is a difficult task. In this paper, we tackle this issue by introducing a principled method for learning reduced network architectures in a data-driven way using reinforcement learning. Our approach takes a larger 'teacher' network as input and outputs a compressed 'student' network derived from the 'teacher' network. In the first stage of our method, a recurrent policy network aggressively removes layers from the large 'teacher' model. In the second stage, another recurrent policy network carefully reduces the size of each remaining layer. The resulting network is then evaluated to obtain a reward -- a score based on the accuracy and compression of the network. 
Our approach uses this reward signal with policy gradients to train the policies to find a locally optimal student network. Our experiments show that we can achieve compression rates of more than 10x for models such as ResNet-34 while maintaining similar performance to the input 'teacher' network. We also present a valuable transfer learning result which shows that policies which are pre-trained on smaller 'teacher' networks can be used to rapidly speed up training on larger 'teacher' networks.", "keywords": "Deep learning;Neural networks;Model compression", "primary_area": "", "supplementary_material": "", "author": "Anubhav Ashok;Nicholas Rhinehart;Fares Beainy;Kris M. Kitani", "authorids": "anubhava@andrew.cmu.edu;nrhineha@cs.cmu.edu;fares.beainy@volvo.com;kkitani@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nashok2018nn,\ntitle={N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning},\nauthor={Anubhav Ashok and Nicholas Rhinehart and Fares Beainy and Kris M. Kitani},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1hcZZ-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 232, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=871626235713849785&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1hcZZ-AW", "pdf": "https://openreview.net/pdf?id=B1hcZZ-AW", "email": ";;;", "author_num": 4 }, { "id": "B1i7ezW0-", "title": "Semi-Supervised Learning via New Deep Network Inversion", "track": "main", "status": "Reject", "tldr": "We exploit an inversion scheme for arbitrary deep neural networks to develop a new semi-supervised learning framework applicable to many topologies.", "abstract": "We exploit a recently derived inversion scheme for arbitrary deep neural networks to develop a new semi-supervised learning framework that applies to a wide range of systems and problems. \nThe approach reaches current state-of-the-art methods on MNIST and provides reasonable performances on SVHN and CIFAR10. Through the introduced method, residual networks are for the first time applied to semi-supervised tasks. Experiments with one-dimensional signals highlight the generality of the method. Importantly, our approach is simple, efficient, and requires no change in the deep network architecture.", "keywords": "inversion scheme;deep neural networks;semi-supervised learning;MNIST;SVHN;CIFAR10", "primary_area": "", "supplementary_material": "", "author": "Balestriero R.;Roger V.;Glotin H.;Baraniuk R.", "authorids": "randallbalestriero@gmail.com;roger.dyni@gmail.com;herve.glotin@univ-tln.fr;richb@rice.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nr.2018semisupervised,\ntitle={Semi-Supervised Learning via New Deep Network Inversion},\nauthor={Balestriero R. and Roger V. and Glotin H. 
and Baraniuk R.},\nyear={2018},\nurl={https://openreview.net/forum?id=B1i7ezW0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1i7ezW0-", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;2", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15627814190977745964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Divide and Conquer Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/44", "id": "B1jscMbAW", "author_site": "Alex Nowak, David Folqu\u00e9 Garcia, Joan Bruna", "tldr": "Dynamic model that learns divide and conquer strategies by weak supervision.", "abstract": "We consider the learning of algorithmic tasks by mere observation of input-output\npairs. Rather than studying this as a black-box discrete regression problem with\nno assumption whatsoever on the input-output mapping, we concentrate on tasks\nthat are amenable to the principle of divide and conquer, and study what are its\nimplications in terms of learning.\nThis principle creates a powerful inductive bias that we leverage with neural\narchitectures that are defined recursively and dynamically, by learning two scale-\ninvariant atomic operations: how to split a given input into smaller sets, and how\nto merge two partially solved tasks into a larger partial solution. Our model can be\ntrained in weakly supervised environments, namely by just observing input-output\npairs, and in even weaker environments, using a non-differentiable reward signal.\nMoreover, thanks to the dynamic aspect of our architecture, we can incorporate\nthe computational complexity as a regularization term that can be optimized by\nbackpropagation. We demonstrate the flexibility and efficiency of the Divide-\nand-Conquer Network on several combinatorial and geometric tasks: convex hull,\nclustering, knapsack and euclidean TSP. 
Thanks to the dynamic programming\nnature of our model, we show significant improvements in terms of generalization\nerror and computational complexity.", "keywords": "Neural Networks;Combinatorial Optimization;Algorithms", "primary_area": "", "supplementary_material": "", "author": "Alex Nowak;David Folqu\u00e9;Joan Bruna", "authorids": "alexnowakvila@gmail.com;david.folque@gmail.com;bruna@cims.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nnowak2018divide,\ntitle={Divide and Conquer Networks},\nauthor={Alex Nowak and David Folqu\u00e9 and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1jscMbAW},\n}", "github": "[![github](/images/github_icon.svg) alexnowakvila/DiCoNet](https://github.com/alexnowakvila/DiCoNet)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=337144182014397579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=B1jscMbAW", "pdf": "https://openreview.net/pdf?id=B1jscMbAW", "email": ";;", "author_num": 3 }, { "id": "B1kIr-WRb", "title": "LEARNING SEMANTIC WORD RESPRESENTATIONS VIA TENSOR FACTORIZATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many state-of-the-art word embedding techniques involve factorization of a cooccurrence\nbased matrix. We aim to extend this approach by studying word embedding\ntechniques that involve factorization of co-occurrence based tensors (N-\nway arrays). We present two new word embedding techniques based on tensor\nfactorization and show that they outperform common methods on several semantic\nNLP tasks when given the same data. To train one of the embeddings, we present\na new joint tensor factorization problem and an approach for solving it. Furthermore,\nwe modify the performance metrics for the Outlier Detection Camacho-\nCollados & Navigli (2016) task to measure the quality of higher-order relationships\nthat a word embedding captures. Our tensor-based methods significantly\noutperform existing methods at this task when using our new metric. 
Finally, we\ndemonstrate that vectors in our embeddings can be composed multiplicatively to\ncreate different vector representations for each meaning of a polysemous word.\nWe show that this property stems from the higher order information that the vectors\ncontain, and thus is unique to our tensor based embeddings.", "keywords": "Word Embeddings;Tensor Factorization;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Eric Bailey;Charles Meyer;Shuchin Aeron", "authorids": "popcorncolonel@gmail.com;cmey63@gmail.com;shuchin@ece.tufts.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbailey2018learning,\ntitle={{LEARNING} {SEMANTIC} {WORD} {RESPRESENTATIONS} {VIA} {TENSOR} {FACTORIZATION}},\nauthor={Eric Bailey and Charles Meyer and Shuchin Aeron},\nyear={2018},\nurl={https://openreview.net/forum?id=B1kIr-WRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1kIr-WRb", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;5;5", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4705534148828492955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Non-Autoregressive Neural Machine Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/241", "id": "B1l8BtlCb", "author_site": "Jiatao Gu, James Bradbury, Caiming Xiong, Victor OK Li, richard socher", "tldr": "We introduce the first NMT model with fully parallel decoding, reducing inference latency by 10x.", "abstract": "Existing approaches to neural machine translation condition each output word on previously generated outputs. We introduce a model that avoids this autoregressive property and produces its outputs in parallel, allowing an order of magnitude lower latency during inference. Through knowledge distillation, the use of input token fertilities as a latent variable, and policy gradient fine-tuning, we achieve this at a cost of as little as 2.0 BLEU points relative to the autoregressive Transformer network used as a teacher. We demonstrate substantial cumulative improvements associated with each of the three aspects of our training strategy, and validate our approach on IWSLT 2016 English\u2013German and two WMT language pairs. By sampling fertilities in parallel at inference time, our non-autoregressive model achieves near-state-of-the-art performance of 29.8 BLEU on WMT 2016 English\u2013Romanian.", "keywords": "machine translation;non-autoregressive;transformer;fertility;nmt", "primary_area": "", "supplementary_material": "", "author": "Jiatao Gu;James Bradbury;Caiming Xiong;Victor O.K. Li;Richard Socher", "authorids": "jiataogu@eee.hku.hk;james.bradbury@salesforce.com;cxiong@salesforce.com;vli@eee.hku.hk;rsocher@salesforce.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngu2018nonautoregressive,\ntitle={Non-Autoregressive Neural Machine Translation},\nauthor={Jiatao Gu and James Bradbury and Caiming Xiong and Victor O.K. 
Li and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1l8BtlCb},\n}", "github": "[![github](/images/github_icon.svg) salesforce/nonauto-nmt](https://github.com/salesforce/nonauto-nmt) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1l8BtlCb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 933, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3482831974828539059&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=B1l8BtlCb", "pdf": "https://openreview.net/pdf?id=B1l8BtlCb", "email": ";;;;", "author_num": 5 }, { "id": "B1lMMx1CW", "title": "THE EFFECTIVENESS OF A TWO-LAYER NEURAL NETWORK FOR RECOMMENDATIONS", "track": "main", "status": "Workshop", "tldr": "Improving recommendations using time sensitive modeling with neural networks in multiple product categories on a retail website", "abstract": "We present a personalized recommender system using neural network for recommending\nproducts, such as eBooks, audio-books, Mobile Apps, Video and Music.\nIt produces recommendations based on customer\u2019s implicit feedback history such\nas purchases, listens or watches. Our key contribution is to formulate recommendation\nproblem as a model that encodes historical behavior to predict the future\nbehavior using soft data split, combining predictor and auto-encoder models. We\nintroduce convolutional layer for learning the importance (time decay) of the purchases\ndepending on their purchase date and demonstrate that the shape of the time\ndecay function can be well approximated by a parametrical function. We present\noffline experimental results showing that neural networks with two hidden layers\ncan capture seasonality changes, and at the same time outperform other modeling\ntechniques, including our recommender in production. Most importantly, we\ndemonstrate that our model can be scaled to all digital categories, and we observe\nsignificant improvements in an online A/B test. We also discuss key enhancements\nto the neural network model and describe our production pipeline. Finally\nwe open-sourced our deep learning library which supports multi-gpu model parallel\ntraining. 
This is an important feature in building neural network based recommenders\nwith large dimensionality of input and output data.", "keywords": "Recommender systems;deep learning;personalization", "primary_area": "", "supplementary_material": "", "author": "Oleg Rybakov;Vijai Mohan;Avishkar Misra;Scott LeGrand;Rejith Joseph;Kiuk Chung;Siddharth Singh;Qian You;Eric Nalisnick;Leo Dirac;Runfei Luo", "authorids": "rybakovo@amazon.com;vijaim@amazon.com;avishkar@gmail.com;slegrand@a9.com;rgeorgej@amazon.com;kiuk@amazon.com;singsidd@amazon.com;qian.you@snapchat.com;enalisni@uci.edu;leodirac@amazon.com;rluo@pstat.ucsb.edu", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\nrybakov2018the,\ntitle={{THE} {EFFECTIVENESS} {OF} A {TWO}-{LAYER} {NEURAL} {NETWORK} {FOR} {RECOMMENDATIONS}},\nauthor={Oleg Rybakov and Vijai Mohan and Avishkar Misra and Scott LeGrand and Rejith Joseph and Kiuk Chung and Siddharth Singh and Qian You and Eric Nalisnick and Leo Dirac and Runfei Luo},\nyear={2018},\nurl={https://openreview.net/forum?id=B1lMMx1CW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1lMMx1CW", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 11, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17533937007309563051&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1mAkPxCZ", "title": "VOCABULARY-INFORMED VISUAL FEATURE AUGMENTATION FOR ONE-SHOT LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "A natural solution for one-shot learning is to augment training data to handle the data deficiency problem. However, directly augmenting in the image domain may not necessarily generate training data that sufficiently explore the intra-class space for one-shot classification. Inspired by the recent vocabulary-informed learning, we propose to generate synthetic training data with the guide of the semantic word space. Essentially, we train an auto-encoder as a bridge to enable the transformation between the image feature space and the semantic space. Besides directly augmenting image features, we transform the image features to semantic space using the encoder and perform the data augmentation. The decoder then synthesizes the image features for the augmented instances from the semantic space. Experiments on three datasets show that our data augmentation method effectively improves the performance of one-shot classification. An extensive study shows that data augmented from semantic space are complementary with those from the image space, and thus boost the classification accuracy dramatically. Source code and dataset will be available. 
", "keywords": "vocabulary-informed learning;data augmentation", "primary_area": "", "supplementary_material": "", "author": "jianqi ma;hangyu lin;yinda zhang;yanwei fu;xiangyang xue", "authorids": ";;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nma2018vocabularyinformed,\ntitle={{VOCABULARY}-{INFORMED} {VISUAL} {FEATURE} {AUGMENTATION} {FOR} {ONE}-{SHOT} {LEARNING}},\nauthor={jianqi ma and hangyu lin and yinda zhang and yanwei fu and xiangyang xue},\nyear={2018},\nurl={https://openreview.net/forum?id=B1mAkPxCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=B1mAkPxCZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1mSWUxR-", "title": "Softmax Q-Distribution Estimation for Structured Prediction: A Theoretical Interpretation for RAML", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reward augmented maximum likelihood (RAML), a simple and effective learning framework to directly optimize towards the reward function in structured prediction tasks, has led to a number of impressive empirical successes. RAML incorporates task-specific reward by performing maximum-likelihood updates on candidate outputs sampled according to an exponentiated payoff distribution, which gives higher probabilities to candidates that are close to the reference output. While RAML is notable for its simplicity, efficiency, and its impressive empirical successes, the theoretical properties of RAML, especially the behavior of the exponentiated payoff distribution, has not been examined thoroughly. In this work, we introduce softmax Q-distribution estimation, a novel theoretical interpretation of RAML, which reveals the relation between RAML and Bayesian decision theory. The softmax Q-distribution can be regarded as a smooth approximation of the Bayes decision boundary, and the Bayes decision rule is achieved by decoding with this Q-distribution. We further show that RAML is equivalent to approximately estimating the softmax Q-distribution, with the temperature $\\tau$ controlling approximation error. We perform two experiments, one on synthetic data of multi-class classification and one on real data of image captioning, to demonstrate the relationship between RAML and the proposed softmax Q-distribution estimation, verifying our theoretical analysis. 
Additional experiments on three structured prediction tasks with rewards defined on sequential (named entity recognition), tree-based (dependency parsing) and irregular (machine translation) structures show notable improvements over maximum likelihood baselines.", "keywords": "structured prediction;RAML;theory;Bayes decision rule;reward function", "primary_area": "", "supplementary_material": "", "author": "Xuezhe Ma;Pengcheng Yin;Jingzhou Liu;Graham Neubig;Eduard Hovy", "authorids": "xuezhem@cs.cmu.edu;pcyin@cs.cmu.edu;liujingzhou@cs.cmu.edu;gneubig@cs.cmu.edu;hovy@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nma2018softmax,\ntitle={Softmax Q-Distribution Estimation for Structured Prediction: A Theoretical Interpretation for {RAML}},\nauthor={Xuezhe Ma and Pengcheng Yin and Jingzhou Liu and Graham Neubig and Eduard Hovy},\nyear={2018},\nurl={https://openreview.net/forum?id=B1mSWUxR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1mSWUxR-", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;2;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8309904459036874771&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Universal Agent for Disentangling Environments and Tasks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/3", "id": "B1mvVm-C-", "author_site": "Jiayuan Mao, Honghua Dong, Joseph J Lim", "tldr": "We propose a DRL framework that disentangles task and environment specific knowledge.", "abstract": "Recent state-of-the-art reinforcement learning algorithms are trained under the goal of excelling in one specific task. Hence, both environment and task specific knowledge are entangled into one framework. However, there are often scenarios where the environment (e.g. the physical world) is fixed while only the target task changes. Hence, borrowing the idea from hierarchical reinforcement learning, we propose a framework that disentangles task and environment specific knowledge by separating them into two units. The environment-specific unit handles how to move from one state to the target state; and the task-specific unit plans for the next target state given a specific task. The extensive results in simulators indicate that our method can efficiently separate and learn two independent units, and also adapt to a new task more efficiently than the state-of-the-art methods.", "keywords": "reinforcement learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Jiayuan Mao;Honghua Dong;Joseph J. Lim", "authorids": "mjy14@mails.tsinghua.edu.cn;dhh14@mails.tsinghua.edu.cn;limjj@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmao2018universal,\ntitle={Universal Agent for Disentangling Environments and Tasks},\nauthor={Jiayuan Mao and Honghua Dong and Joseph J. 
Lim},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1mvVm-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15599654995562798125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=B1mvVm-C-", "pdf": "https://openreview.net/pdf?id=B1mvVm-C-", "email": ";;", "author_num": 3 }, { "title": "Generalizing Hamiltonian Monte Carlo with Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/284", "id": "B1n8LexRZ", "author_site": "Daniel Levy, Matthew D Hoffman, Jascha Sohl-Dickstein", "tldr": "General method to train expressive MCMC kernels parameterized with deep neural networks. Given a target distribution p, our method provides a fast-mixing sampler, able to efficiently explore the state space.", "abstract": "We present a general-purpose method to train Markov chain Monte Carlo kernels, parameterized by deep neural networks, that converge and mix quickly to their target distribution. Our method generalizes Hamiltonian Monte Carlo and is trained to maximize expected squared jumped distance, a proxy for mixing speed. We demonstrate large empirical gains on a collection of simple but challenging distributions, for instance achieving a 106x improvement in effective sample size in one case, and mixing when standard HMC makes no measurable progress in a second. Finally, we show quantitative and qualitative gains on a real-world task: latent-variable generative modeling. Python source code will be open-sourced with the camera-ready paper.", "keywords": "markov;chain;monte;carlo;sampling;posterior;deep;learning;hamiltonian;mcmc", "primary_area": "", "supplementary_material": "", "author": "Daniel Levy;Matt D. Hoffman;Jascha Sohl-Dickstein", "authorids": "danilevy@cs.stanford.edu;mhoffman@google.com;jaschasd@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlevy2018generalizing,\ntitle={Generalizing Hamiltonian Monte Carlo with Neural Networks},\nauthor={Daniel Levy and Matt D. 
Hoffman and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1n8LexRZ},\n}", "github": "[![github](/images/github_icon.svg) brain-research/l2hmc](https://github.com/brain-research/l2hmc) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1n8LexRZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;2", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6189563132756829558&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1n8LexRZ", "pdf": "https://openreview.net/pdf?id=B1n8LexRZ", "email": ";;", "author_num": 3 }, { "id": "B1nLkl-0Z", "title": "Learning Gaussian Policies from Smoothed Action Value Functions", "track": "main", "status": "Reject", "tldr": "We propose a new Q-value function that enables better learning of Gaussian policies.", "abstract": "State-action value functions (i.e., Q-values) are ubiquitous in reinforcement learning (RL), giving rise to popular algorithms such as SARSA and Q-learning. We propose a new notion of action value defined by a Gaussian smoothed version of the expected Q-value used in SARSA. We show that such smoothed Q-values still satisfy a Bellman equation, making them naturally learnable from experience sampled from an environment. Moreover, the gradients of expected reward with respect to the mean and covariance of a parameterized Gaussian policy can be recovered from the gradient and Hessian of the smoothed Q-value function. Based on these relationships we develop new algorithms for training a Gaussian policy directly from a learned Q-value approximator. The approach is also amenable to proximal optimization techniques by augmenting the objective with a penalty on KL-divergence from a previous policy. 
We find that the ability to learn both a mean and covariance during training allows this approach to achieve strong results on standard continuous control benchmarks.", "keywords": "Reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ofir Nachum;Mohammad Norouzi;George Tucker;Dale Schuurmans", "authorids": "ofirnachum@google.com;mnorouzi@google.com;gjt@google.com;schuurmans@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnachum2018learning,\ntitle={Learning Gaussian Policies from Smoothed Action Value Functions},\nauthor={Ofir Nachum and Mohammad Norouzi and George Tucker and Dale Schuurmans},\nyear={2018},\nurl={https://openreview.net/forum?id=B1nLkl-0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1nLkl-0Z", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18403620706940514274&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Learning to Multi-Task by Active Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/257", "id": "B1nZ1weCZ", "author_site": "Sahil Sharma, Ashutosh Kumar Jha, Parikshit Hegde, Balaraman Ravindran", "tldr": "Letting a meta-learner decide the task to train on for an agent in a multi-task setting improves multi-tasking ability substantially", "abstract": "One of the long-standing challenges in Artificial Intelligence for learning goal-directed behavior is to build a single agent which can solve multiple tasks. Recent progress in multi-task learning for goal-directed sequential problems has been in the form of distillation based learning wherein a student network learns from multiple task-specific expert networks by mimicking the task-specific policies of the expert networks. While such approaches offer a promising solution to the multi-task learning problem, they require supervision from large expert networks which require extensive data and computation time for training.\nIn this work, we propose an efficient multi-task learning framework which solves multiple goal-directed tasks in an on-line setup without the need for expert supervision. Our work uses active learning principles to achieve multi-task learning by sampling the harder tasks more than the easier ones. We propose three distinct models under our active sampling framework. An adaptive method with extremely competitive multi-tasking performance. A UCB-based meta-learner which casts the problem of picking the next task to train on as a multi-armed bandit problem. A meta-learning method that casts the next-task picking problem as a full Reinforcement Learning problem and uses actor-critic methods for optimizing the multi-tasking performance directly. 
We demonstrate results in the Atari 2600 domain on seven multi-tasking instances: three 6-task instances, one 8-task instance, two 12-task instances and one 21-task instance.", "keywords": "Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Sahil Sharma*;Ashutosh Kumar Jha*;Parikshit S Hegde;Balaraman Ravindran", "authorids": "sahil@cse.iitm.ac.in;me14b148@smail.iitm.ac.in;ee14b123@ee.iitm.ac.in;ravi@cse.iitm.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsharma2018learning,\ntitle={Learning to Multi-Task by Active Sampling},\nauthor={Sahil Sharma and Ashutosh Kumar Jha and Parikshit S Hegde and Balaraman Ravindran},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1nZ1weCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7942202814055616408&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=B1nZ1weCZ", "pdf": "https://openreview.net/pdf?id=B1nZ1weCZ", "email": ";;;", "author_num": 4 }, { "id": "B1nxTzbRZ", "title": "Forward Modeling for Partial Observation Strategy Games - A StarCraft Defogger", "track": "main", "status": "Reject", "tldr": "This paper presents a defogger, a model that learns to predict future hidden information from partial observations, applied to a StarCraft dataset.", "abstract": "In this paper, we present a defogger, a model that learns to predict future hidden information from partial observations. We formulate this model in the context of forward modeling and leverage spatial and sequential constraints and correlations via convolutional neural networks and long short-term memory networks, respectively. We evaluate our approach on a large dataset of human games of StarCraft: Brood War, a real-time strategy video game. 
Our models consistently beat strong rule-based baselines and qualitatively produce sensible future game states.", "keywords": "forward modeling;partially observable;deep learning;strategy game;real-time strategy", "primary_area": "", "supplementary_material": "", "author": "Gabriel Synnaeve;Zeming Lin;Jonas Gehring;Vasil Khalidov;Nicolas Carion;Nicolas Usunier", "authorids": "gab@fb.com;zlin@fb.com;jgehring@fb.com;vkhalidov@fb.com;alcinos@fb.com;usunier@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nsynnaeve2018forward,\ntitle={Forward Modeling for Partial Observation Strategy Games - A StarCraft Defogger},\nauthor={Gabriel Synnaeve and Zeming Lin and Jonas Gehring and Vasil Khalidov and Nicolas Carion and Nicolas Usunier},\nyear={2018},\nurl={https://openreview.net/forum?id=B1nxTzbRZ},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/starcraft_defogger](https://github.com/facebookresearch/starcraft_defogger)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1nxTzbRZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "1;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5562179615762953081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1p461b0W", "title": "Deep Learning is Robust to Massive Label Noise", "track": "main", "status": "Reject", "tldr": "We show that deep neural networks are able to learn from data that has been diluted by an arbitrary amount of noise.", "abstract": "Deep neural networks trained on large supervised datasets have led to impressive results in recent years. However, since well-annotated datasets can be prohibitively expensive and time-consuming to collect, recent work has explored the use of larger but noisy datasets that can be more easily obtained. In this paper, we investigate the behavior of deep neural networks on training sets with massively noisy labels. We show on multiple datasets such as MNIST, CIFAR-10 and ImageNet that successful learning is possible even with an essentially arbitrary amount of noise. For example, on MNIST we find that accuracy of above 90 percent is still attainable even when the dataset has been diluted with 100 noisy examples for each clean example. Such behavior holds across multiple patterns of label noise, even when noisy labels are biased towards confusing classes. Further, we show how the required dataset size for successful training increases with higher label noise. 
Finally, we present simple actionable techniques for improving learning in the regime of high label noise.", "keywords": "label noise;weakly supervised learning;robustness of neural networks;deep learning;large datasets", "primary_area": "", "supplementary_material": "", "author": "David Rolnick;Andreas Veit;Serge Belongie;Nir Shavit", "authorids": "drolnick@mit.edu;av443@cornell.edu;sjb344@cornell.edu;shanir@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nrolnick2018deep,\ntitle={Deep Learning is Robust to Massive Label Noise},\nauthor={David Rolnick and Andreas Veit and Serge Belongie and Nir Shavit},\nyear={2018},\nurl={https://openreview.net/forum?id=B1p461b0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1p461b0W", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 722, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5647581376755512521&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "B1spAqUp-", "title": "Pixel Deconvolutional Networks", "track": "main", "status": "Reject", "tldr": "Solve checkerboard problem in Deconvolutional layer by building dependencies between pixels", "abstract": "Deconvolutional layers have been widely used in a variety of deep\nmodels for up-sampling, including encoder-decoder networks for\nsemantic segmentation and deep generative models for unsupervised\nlearning. One of the key limitations of deconvolutional operations\nis that they result in the so-called checkerboard problem. This is\ncaused by the fact that no direct relationship exists among adjacent\npixels on the output feature map. To address this problem, we\npropose the pixel deconvolutional layer (PixelDCL) to establish\ndirect relationships among adjacent pixels on the up-sampled feature\nmap. Our method is based on a fresh interpretation of the regular\ndeconvolution operation. The resulting PixelDCL can be used to\nreplace any deconvolutional layer in a plug-and-play manner without\ncompromising the fully trainable capabilities of original models.\nThe proposed PixelDCL may result in slight decrease in efficiency,\nbut this can be overcome by an implementation trick. Experimental\nresults on semantic segmentation demonstrate that PixelDCL can\nconsider spatial features such as edges and shapes and yields more\naccurate segmentation outputs than deconvolutional layers. 
When used\nin image generation tasks, our PixelDCL can largely overcome the\ncheckerboard problem suffered by regular deconvolution operations.", "keywords": "Deep Learning;Deconvolutional Layer;Pixel CNN", "primary_area": "", "supplementary_material": "", "author": "Hongyang Gao;Hao Yuan;Zhengyang Wang;Shuiwang Ji", "authorids": "hongyang.gao@wsu.edu;hao.yuan@wsu.edu;zwang6@eecs.wsu.edu;sji@eecs.wsu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngao2018pixel,\ntitle={Pixel Deconvolutional Networks},\nauthor={Hongyang Gao and Hao Yuan and Zhengyang Wang and Shuiwang Ji},\nyear={2018},\nurl={https://openreview.net/forum?id=B1spAqUp-},\n}", "github": "[![github](/images/github_icon.svg) divelab/PixelDCN](https://github.com/divelab/PixelDCN) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1spAqUp-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1spAqUp-", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8251888240772053601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1suU-bAW", "title": "Learning Covariate-Specific Embeddings with Tensor Decompositions", "track": "main", "status": "Reject", "tldr": "Using the same embedding across covariates doesn't make sense, we show that a tensor decomposition algorithm learns sparse covariate-specific embeddings and naturally separable topics jointly and data-efficiently.", "abstract": "Word embedding is a useful approach to capture co-occurrence structures in a large corpus of text. In addition to the text data itself, we often have additional covariates associated with individual documents in the corpus---e.g. the demographic of the author, time and venue of publication, etc.---and we would like the embedding to naturally capture the information of the covariates. In this paper, we propose a new tensor decomposition model for word embeddings with covariates. Our model jointly learns a \\emph{base} embedding for all the words as well as a weighted diagonal transformation to model how each covariate modifies the base embedding. To obtain the specific embedding for a particular author or venue, for example, we can then simply multiply the base embedding by the transformation matrix associated with that time or venue. The main advantages of our approach is data efficiency and interpretability of the covariate transformation matrix. Our experiments demonstrate that our joint model learns substantially better embeddings conditioned on each covariate compared to the standard approach of learning a separate embedding for each covariate using only the relevant subset of data. Furthermore, our model encourages the embeddings to be ``topic-aligned'' in the sense that the dimensions have specific independent meanings. This allows our covariate-specific embeddings to be compared by topic, enabling downstream differential analysis. 
We empirically evaluate the benefits of our algorithm on several datasets, and demonstrate how it can be used to address many natural questions about the effects of covariates.", "keywords": "Word embedding;tensor decomposition", "primary_area": "", "supplementary_material": "", "author": "Kevin Tian;Teng Zhang;James Zou", "authorids": "kjtian@stanford.edu;tengz@stanford.edu;jamesz@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntian2018learning,\ntitle={Learning Covariate-Specific Embeddings with Tensor Decompositions},\nauthor={Kevin Tian and Teng Zhang and James Zou},\nyear={2018},\nurl={https://openreview.net/forum?id=B1suU-bAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1suU-bAW", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;5", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JxtrGultnZUJ:scholar.google.com/&scioq=Learning+Covariate-Specific+Embeddings+with+Tensor+Decompositions&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "B1tC-LT6W", "title": "Trace norm regularization and faster inference for embedded speech recognition RNNs", "track": "main", "status": "Reject", "tldr": "We compress and speed up speech recognition models on embedded devices through a trace norm regularization technique and optimized kernels.", "abstract": "We propose and evaluate new techniques for compressing and speeding up dense matrix multiplications as found in the fully connected and recurrent layers of neural networks for embedded large vocabulary continuous speech recognition (LVCSR). For compression, we introduce and study a trace norm regularization technique for training low rank factored versions of matrix multiplications. Compared to standard low rank training, we show that our method leads to good accuracy versus number of parameter trade-offs and can be used to speed up training of large models. For speedup, we enable faster inference on ARM processors through new open sourced kernels optimized for small batch sizes, resulting in 3x to 7x speed ups over the widely used gemmlowp library. 
Beyond LVCSR, we expect our techniques and kernels to be more generally applicable to embedded neural networks with large fully connected or recurrent layers.", "keywords": "LVCSR;speech recognition;embedded;low rank factorization;RNN;GRU;trace norm", "primary_area": "", "supplementary_material": "", "author": "Markus Kliegl;Siddharth Goyal;Kexin Zhao;Kavya Srinet;Mohammad Shoeybi", "authorids": "mkliegl@gmail.com;goyalsiddharth@baidu.com;zhaokexin01@baidu.com;srinetkavya@baidu.com;shoeybim@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkliegl2018trace,\ntitle={Trace norm regularization and faster inference for embedded speech recognition {RNN}s},\nauthor={Markus Kliegl and Siddharth Goyal and Kexin Zhao and Kavya Srinet and Mohammad Shoeybi},\nyear={2018},\nurl={https://openreview.net/forum?id=B1tC-LT6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1tC-LT6W", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1995424622689014469&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1tExikAW", "title": "LatentPoison -- Adversarial Attacks On The Latent Space", "track": "main", "status": "Reject", "tldr": "Adversarial attacks on the latent space of variational autoencoders to change the semantic meaning of inputs", "abstract": "Robustness and security of machine learning (ML) systems are intertwined, wherein a non-robust ML system (classifiers, regressors, etc.) can be subject to attacks using a wide variety of exploits. With the advent of scalable deep learning methodologies, a lot of emphasis has been put on the robustness of supervised, unsupervised and reinforcement learning algorithms. Here, we study the robustness of the latent space of a deep variational autoencoder (dVAE), an unsupervised generative framework, to show that it is indeed possible to perturb the latent space, flip the class predictions and keep the classification probability approximately equal before and after an attack. This means that an agent that looks at the outputs of a decoder would remain oblivious to an attack.", "keywords": "adversarial attacks;security;auto-encoder", "primary_area": "", "supplementary_material": "", "author": "Antonia Creswell;Biswa Sengupta;Anil A. Bharath", "authorids": "ac2211@ic.ac.uk;b.sengupta@imperial.ac.uk;a.bharath@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncreswell2018latentpoison,\ntitle={LatentPoison -- Adversarial Attacks On The Latent Space},\nauthor={Antonia Creswell and Biswa Sengupta and Anil A. 
Bharath},\nyear={2018},\nurl={https://openreview.net/forum?id=B1tExikAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1tExikAW", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;3", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11375532360829030144&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1twdMCab", "title": "Dynamic Integration of Background Knowledge in Neural NLU Systems", "track": "main", "status": "Reject", "tldr": "In this paper we present a task-agnostic reading architecture for the dynamic integration of explicit background knowledge in neural NLU models. ", "abstract": "Common-sense or background knowledge is required to understand natural language, but in most neural natural language understanding (NLU) systems, the requisite background knowledge is indirectly acquired from static corpora. We develop a new reading architecture for the dynamic integration of explicit background knowledge in NLU models. A new task-agnostic reading module provides refined word representations to a task-specific NLU architecture by processing background knowledge in the form of free-text statements, together with the task-specific inputs. Strong performance on the tasks of document question answering (DQA) and recognizing textual entailment (RTE) demonstrate the effectiveness and flexibility of our approach. Analysis shows that our models learn to exploit knowledge selectively and in a semantically appropriate way.", "keywords": "natural language processing;background knowledge;word embeddings;question answering;natural language inference", "primary_area": "", "supplementary_material": "", "author": "Dirk Weissenborn;Tomas Kocisky;Chris Dyer", "authorids": "dirk.weissenborn@dfki.de;tkocisky@google.com;cdyer@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nweissenborn2018dynamic,\ntitle={Dynamic Integration of Background Knowledge in Neural {NLU} Systems},\nauthor={Dirk Weissenborn and Tomas Kocisky and Chris Dyer},\nyear={2018},\nurl={https://openreview.net/forum?id=B1twdMCab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1twdMCab", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7282061314613697176&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1uvH_gC-", "title": "Parametric Manifold Learning Via Sparse Multidimensional Scaling", "track": "main", "status": "Reject", "tldr": "Parametric Manifold Learning with Neural Networks in a Geometric Framework ", "abstract": "We propose a metric-learning framework for computing distance-preserving maps that generate low-dimensional embeddings for a certain class of manifolds. We employ Siamese networks to solve the problem of least squares multidimensional scaling for generating mappings that preserve geodesic distances on the manifold. 
In contrast to previous parametric manifold learning methods, we show a substantial reduction in training effort enabled by the computation of geodesic distances in a farthest point sampling strategy. Additionally, the use of a network to model the distance-preserving map reduces the complexity of the multidimensional scaling problem and leads to an improved non-local generalization of the manifold compared to analogous non-parametric counterparts. We demonstrate our claims on point-cloud data and on image manifolds and show a numerical analysis of our technique to facilitate a greater understanding of the representational power of neural networks in modeling manifold data.", "keywords": "Manifold Learning;Non-linear Dimensionality Reduction;Neural Networks;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Gautam Pai;Ronen Talmon;Ron Kimmel", "authorids": "paigautam@cs.technion.ac.il;ronen@ef.technion.ac.il;ron@cs.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npai2018parametric,\ntitle={Parametric Manifold Learning Via Sparse Multidimensional Scaling},\nauthor={Gautam Pai and Ronen Talmon and Ron Kimmel},\nyear={2018},\nurl={https://openreview.net/forum?id=B1uvH_gC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1uvH_gC-", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;5", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13295108880896623795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1wN2f2-G", "title": "Attribute-aware Collaborative Filtering: Survey and Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Attribute-aware CF models aim at rating prediction given not only the historical rating from users to items, but also the information associated with users (e.g. age), items (e.g. price), or even ratings (e.g. rating time). This paper surveys\nworks in the past decade developing attribute-aware CF systems, and discovers that mathematically they can be classified into four different categories. We provide the readers not only the high level mathematical interpretation of the existing\nworks in this area but also the mathematical insight for each category of models. 
Finally we provide our preliminary experiment results comparing the effectiveness of the major works in each category.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wen-Hao Chen;Chin-Chi Hsu;Mi-Yen Yeh;Shou-De Lin", "authorids": "b02902023@ntu.edu.tw;chinchi@iis.sinica.edu.tw;miyen@iis.sinica.edu.tw;sdlin@csie.ntu.edu.tw", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1wN2f2-G", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 3, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17411323805492956918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1ydPgTpW", "title": "Predicting Auction Price of Vehicle License Plate with Deep Recurrent Neural Network", "track": "main", "status": "Reject", "tldr": "Predicting auction price of vehicle license plates in Hong Kong with deep recurrent neural network, based on the characters on the plates.", "abstract": "In Chinese societies, superstition is of paramount importance, and vehicle license plates with desirable numbers can fetch very high prices in auctions. Unlike other valuable items, license plates are not allocated an estimated price before auction. \n\nI propose that the task of predicting plate prices can be viewed as a natural language processing (NLP) task, as the value depends on the meaning of each individual character on the plate and its semantics. I construct a deep recurrent neural network (RNN) to predict the prices of vehicle license plates in Hong Kong, based on the characters on a plate. I demonstrate the importance of having a deep network and of retraining. Evaluated on 13 years of historical auction prices, the deep RNN's predictions can explain over 80 percent of price variations, outperforming previous models by a significant margin. 
I also demonstrate how the model can be extended to become a search engine for plates and to provide estimates of the expected price distribution.", "keywords": "price predictions;expert system;recurrent neural networks;deep learning;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Vinci Chow", "authorids": "vincichow@cuhk.edu.hk", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nchow2018predicting,\ntitle={Predicting Auction Price of Vehicle License Plate with Deep Recurrent Neural Network},\nauthor={Vinci Chow},\nyear={2018},\nurl={https://openreview.net/forum?id=B1ydPgTpW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ydPgTpW", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=603332207241541052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "Large Scale Optimal Transport and Mapping Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/179", "id": "B1zlp1bRW", "author_site": "Vivien Seguy, Bharath Bhushan Damodaran, R\u00e9mi Flamary, Nicolas Courty, Antoine Rolet, Mathieu Blondel", "tldr": "Learning optimal mapping with deepNN between distributions along with theoretical guarantees.", "abstract": "This paper presents a novel two-step approach for the fundamental problem of learning an optimal map from one distribution to another. First, we learn an optimal transport (OT) plan, which can be thought as a one-to-many map between the two distributions. To that end, we propose a stochastic dual approach of regularized OT, and show empirically that it scales better than a recent related approach when the amount of samples is very large. Second, we estimate a Monge map as a deep neural network learned by approximating the barycentric projection of the previously-obtained OT plan. This parameterization allows generalization of the mapping outside the support of the input measure. We prove two theoretical stability results of regularized OT which show that our estimations converge to the OT and Monge map between the underlying continuous measures. 
We showcase our proposed approach on two applications: domain adaptation and generative modeling.", "keywords": "optimal transport;Wasserstein;domain adaptation;generative models;Monge map;optimal mapping", "primary_area": "", "supplementary_material": "", "author": "Vivien Seguy;Bharath Bhushan Damodaran;Remi Flamary;Nicolas Courty;Antoine Rolet;Mathieu Blondel", "authorids": "vivienseguy@gmail.com;bharath-bhushan.damodaran@irisa.fr;remi.flamary@unice.fr;courty@univ-ubs.fr;antoine.rolet@iip.ist.i.kyoto-u.ac.jp;mblondel@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nseguy2018large,\ntitle={Large Scale Optimal Transport and Mapping Estimation},\nauthor={Vivien Seguy and Bharath Bhushan Damodaran and Remi Flamary and Nicolas Courty and Antoine Rolet and Mathieu Blondel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=B1zlp1bRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7;8", "confidence": "3;3;3;3", "rating_avg": 6.75, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 291, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=513370095015629698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1zlp1bRW", "pdf": "https://openreview.net/pdf?id=B1zlp1bRW", "email": ";;;;;", "author_num": 6 }, { "title": "Learning Differentially Private Recurrent Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/187", "id": "BJ0hF1Z0b", "author_site": "H. Brendan McMahan, Daniel Ramage, Kunal Talwar, Li Zhang", "tldr": "User-level differential privacy for recurrent neural network language models is possible with a sufficiently large dataset.", "abstract": "We demonstrate that it is possible to train large recurrent language models with user-level differential privacy guarantees with only a negligible cost in predictive accuracy. Our work builds on recent advances in the training of deep networks on user-partitioned data and privacy accounting for stochastic gradient descent. In particular, we add user-level privacy protection to the federated averaging algorithm, which makes large step updates from user-level data. Our work demonstrates that given a dataset with a sufficiently large number of users (a requirement easily met by even small internet-scale datasets), achieving differential privacy comes at the cost of increased computation, rather than in decreased utility as in most prior work. We find that our private LSTM language models are quantitatively and qualitatively similar to un-noised models when trained on a large dataset.", "keywords": "differential privacy;LSTMs;language models;privacy", "primary_area": "", "supplementary_material": "", "author": "H. 
Brendan McMahan;Daniel Ramage;Kunal Talwar;Li Zhang", "authorids": "mcmahan@google.com;dramage@google.com;kunal@google.com;liqzhang@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbrendan2018learning,\ntitle={Learning Differentially Private Recurrent Language Models},\nauthor={H. Brendan McMahan and Daniel Ramage and Kunal Talwar and Li Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ0hF1Z0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "2;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1654, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1102128856283131840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJ0hF1Z0b", "pdf": "https://openreview.net/pdf?id=BJ0hF1Z0b", "email": ";;;", "author_num": 4 }, { "id": "BJ4prNx0W", "title": "Learning what to learn in a neural program", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning programs with neural networks is a challenging task, addressed by a long line of existing work. It is difficult to learn neural networks which will generalize to problem instances that are much larger than those used during training. Furthermore, even when the learned neural program empirically works on all test inputs, we cannot verify that it will work on every possible input. Recent work has shown that it is possible to address these issues by using recursion in the Neural Programmer-Interpreter, but this technique requires a verification set which is difficult to construct without knowledge of the internals of the oracle used to generate training data. In this work, we show how to automatically build such a verification set, which can also be directly used for training. By interactively querying an oracle, we can construct this set with minimal additional knowledge about the oracle. 
We empirically demonstrate that our method allows automated learning and verification of a recursive NPI program with provably perfect generalization.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Richard Shin;Dawn Song", "authorids": "ricshin@berkeley.edu;dawnsong.travel@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshin2018learning,\ntitle={Learning what to learn in a neural program},\nauthor={Richard Shin and Dawn Song},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ4prNx0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJ4prNx0W", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "BJ6anzb0Z", "title": "Multimodal Sentiment Analysis To Explore the Structure of Emotions", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel approach to multimodal sentiment analysis using deep neural\nnetworks combining visual recognition and natural language processing. Our\ngoal is different than the standard sentiment analysis goal of predicting whether\na sentence expresses positive or negative sentiment; instead, we aim to infer the\nlatent emotional state of the user. Thus, we focus on predicting the emotion word\ntags attached by users to their Tumblr posts, treating these as \u201cself-reported emotions.\u201d\nWe demonstrate that our multimodal model combining both text and image\nfeatures outperforms separate models based solely on either images or text. Our\nmodel\u2019s results are interpretable, automatically yielding sensible word lists associated\nwith emotions. We explore the structure of emotions implied by our model\nand compare it to what has been posited in the psychology literature, and validate\nour model on a set of images that have been used in psychology studies. 
Finally,\nour work also provides a useful tool for the growing academic study of images\u2014\nboth photographs and memes\u2014on social networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anthony Hu;Seth Flaxman", "authorids": "anthony.hu@stats.ox.ac.uk;s.flaxman@imperial.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhu2018multimodal,\ntitle={Multimodal Sentiment Analysis To Explore the Structure of Emotions},\nauthor={Anthony Hu and Seth Flaxman},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ6anzb0Z},\n}", "github": "[![github](/images/github_icon.svg) anthonyhu/tumblr-emotions](https://github.com/anthonyhu/tumblr-emotions) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJ6anzb0Z)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ6anzb0Z", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;5", "rating_avg": 5.0, "confidence_avg": 5.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 144, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12459522581185705674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJ78bJZCZ", "title": "Efficiently applying attention to sequential data with the Recurrent Discounted Attention unit", "track": "main", "status": "Reject", "tldr": "We introduce the Recurrent Discounted Unit which applies attention to any length sequence in linear time", "abstract": "Recurrent Neural Network architectures excel at processing sequences by\nmodelling dependencies over different timescales. The recently introduced\nRecurrent Weighted Average (RWA) unit captures long term dependencies\nfar better than an LSTM on several challenging tasks. The RWA achieves\nthis by applying attention to each input and computing a weighted average\nover the full history of its computations. Unfortunately, the RWA cannot\nchange the attention it has assigned to previous timesteps, and so struggles\nwith carrying out consecutive tasks or tasks with changing requirements.\nWe present the Recurrent Discounted Attention (RDA) unit that builds on\nthe RWA by additionally allowing the discounting of the past.\nWe empirically compare our model to RWA, LSTM and GRU units on\nseveral challenging tasks. On tasks with a single output the RWA, RDA and\nGRU units learn much quicker than the LSTM and with better performance.\nOn the multiple sequence copy task our RDA unit learns the task three\ntimes as quickly as the LSTM or GRU units while the RWA fails to learn at\nall. On the Wikipedia character prediction task the LSTM performs best\nbut it is followed closely by our RDA unit. 
Overall our RDA unit performs\nwell and is sample efficient on a large variety of sequence tasks.", "keywords": "RNNs", "primary_area": "", "supplementary_material": "", "author": "Brendan Maginnis;Pierre Richemond", "authorids": "brendan.maginnis@gmail.com;pierre.richemond@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmaginnis2018efficiently,\ntitle={Efficiently applying attention to sequential data with the Recurrent Discounted Attention unit},\nauthor={Brendan Maginnis and Pierre Richemond},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ78bJZCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ78bJZCZ", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2272792597311092888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJ7d0fW0b", "title": "Faster Reinforcement Learning with Expert State Sequences", "track": "main", "status": "Reject", "tldr": "", "abstract": "Imitation learning relies on expert demonstrations. Existing approaches often require that the complete demonstration data, including sequences of actions and states are available. In this paper, we consider a realistic and more difficult scenario where a reinforcement learning agent only has access to the state sequences of an expert, while the expert actions are not available. Inferring the unseen expert actions in a stochastic environment is challenging and usually infeasible when combined with a large state space. We propose a novel policy learning method which only utilizes the expert state sequences without inferring the unseen actions. Specifically, our agent first learns to extract useful sub-goal information from the state sequences of the expert and then utilizes the extracted sub-goal information to factorize the action value estimate over state-action pairs and sub-goals. The extracted sub-goals are also used to synthesize guidance rewards in the policy learning. We evaluate our agent on five Doom tasks. 
Our empirical results show that the proposed method significantly outperforms the conventional DQN method.", "keywords": "Reinforcement Learning;Imitation Learning", "primary_area": "", "supplementary_material": "", "author": "Xiaoxiao Guo;Shiyu Chang;Mo Yu;Miao Liu;Gerald Tesauro", "authorids": "xiaoxiao.guo@ibm.com;shiyu.chang@ibm.com;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nguo2018faster,\ntitle={Faster Reinforcement Learning with Expert State Sequences},\nauthor={Xiaoxiao Guo and Shiyu Chang and Mo Yu and Miao Liu and Gerald Tesauro},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ7d0fW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJ7d0fW0b", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:shVW70HPYEIJ:scholar.google.com/&scioq=Faster+Reinforcement+Learning+with+Expert+State+Sequences&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Auto-Encoding Sequential Monte Carlo", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/31", "id": "BJ8c3f-0b", "author_site": "Tuan Anh Le, Maximilian Igl, Tom Rainforth, Tom Jin, Frank Wood", "tldr": "We build on auto-encoding sequential Monte Carlo, gain new theoretical insights and develop an improved training procedure based on those insights.", "abstract": "We build on auto-encoding sequential Monte Carlo (AESMC): a method for model and proposal learning based on maximizing the lower bound to the log marginal likelihood in a broad family of structured probabilistic models. Our approach relies on the efficiency of sequential Monte Carlo (SMC) for performing inference in structured probabilistic models and the flexibility of deep neural networks to model complex conditional probability distributions. We develop additional theoretical insights and introduce a new training procedure which improves both model and proposal learning. 
We demonstrate that our approach provides a fast, easy-to-implement and scalable means for simultaneous model learning and proposal adaptation in deep generative models.", "keywords": "Variational Autoencoders;Inference amortization;Model learning;Sequential Monte Carlo;ELBOs", "primary_area": "", "supplementary_material": "", "author": "Tuan Anh Le;Maximilian Igl;Tom Rainforth;Tom Jin;Frank Wood", "authorids": "tuananh@robots.ox.ac.uk;maximilian.igl@gmail.com;twgr@robots.ox.ac.uk;tom@jin.me.uk;fwood@robots.ox.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nanh2018autoencoding,\ntitle={Auto-Encoding Sequential Monte Carlo},\nauthor={Tuan Anh Le and Maximilian Igl and Tom Rainforth and Tom Jin and Frank Wood},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ8c3f-0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "3;7;7", "confidence": "2;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1610514733168322441&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJ8c3f-0b", "pdf": "https://openreview.net/pdf?id=BJ8c3f-0b", "email": ";;;;", "author_num": 5 }, { "id": "BJ8lbVAfz", "title": "Self-Organization adds application robustness to deep learners", "track": "main", "status": "Withdraw", "tldr": "integration of self-organization and supervised learning in a hierarchical neural network", "abstract": "While self-organizing principles have motivated much of early learning models, such principles have rarely been included in deep learning architectures. Indeed, from a supervised learning perspective it seems that topographic constraints are rather decremental to optimal performance. Here we study a network model that incorporates self-organizing maps into a supervised network and show how gradient learning results in a form of a self-organizing learning rule. Moreover, we show that such a model is robust in the sense of its application to a variety of areas, which is believed to be a hallmark of biological learning systems. 
", "keywords": "supervised learning;unsupervised learning;self-organization;internal representation;topological structure", "primary_area": "", "supplementary_material": "", "author": "Pitoyo Hartono;Thomas Trappenberg", "authorids": "hartono@ieee.org;tt@cs.dal.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ8lbVAfz", "pdf_size": 0, "rating": "2;2;4", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 3, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6812633184516511936&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Synthetic and Natural Noise Both Break Neural Machine Translation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/172", "id": "BJ8vJebC-", "author_site": "Yonatan Belinkov, Yonatan Bisk", "tldr": "CharNMT is brittle", "abstract": "Character-based neural machine translation (NMT) models alleviate out-of-vocabulary issues, learn morphology, and move us closer to completely end-to-end translation systems. Unfortunately, they are also very brittle and easily falter when presented with noisy data. In this paper, we confront NMT models with synthetic and natural sources of noise. We find that state-of-the-art models fail to translate even moderately noisy texts that humans have no trouble comprehending. We explore two approaches to increase model robustness: structure-invariant word representations and robust training on noisy texts. We find that a model based on a character convolutional neural network is able to simultaneously learn representations robust to multiple kinds of noise. 
", "keywords": "neural machine translation;characters;noise;adversarial examples;robust training", "primary_area": "", "supplementary_material": "", "author": "Yonatan Belinkov;Yonatan Bisk", "authorids": "belinkov@mit.edu;ybisk@yonatanbisk.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbelinkov2018synthetic,\ntitle={Synthetic and Natural Noise Both Break Neural Machine Translation},\nauthor={Yonatan Belinkov and Yonatan Bisk},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ8vJebC-},\n}", "github": "[![github](/images/github_icon.svg) ybisk/charNMT-noise](https://github.com/ybisk/charNMT-noise) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BJ8vJebC-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 893, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10493132199224079445&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJ8vJebC-", "pdf": "https://openreview.net/pdf?id=BJ8vJebC-", "email": ";", "author_num": 2 }, { "id": "BJB7fkWR-", "title": "Domain Adaptation for Deep Reinforcement Learning in Visually Distinct Games", "track": "main", "status": "Reject", "tldr": "An approach to learning a shared embedding space between visually distinct games.", "abstract": "Many deep reinforcement learning approaches use graphical state representations,\nthis means visually distinct games that share the same underlying structure cannot\neffectively share knowledge. This paper outlines a new approach for learning\nunderlying game state embeddings irrespective of the visual rendering of the game\nstate. We utilise approaches from multi-task learning and domain adaption in\norder to place visually distinct game states on a shared embedding manifold. We\npresent our results in the context of deep reinforcement learning agents.", "keywords": "Deep Reinforcement Learning;Domain Adaptation;Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Dino S. Ratcliffe;Luca Citi;Sam Devlin;Udo Kruschwitz", "authorids": "d.ratcliffe@qmul.ac.uk;lciti@essex.ac.uk;sam.devlin@york.ac.uk;udo@essex.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ns.2018domain,\ntitle={Domain Adaptation for Deep Reinforcement Learning in Visually Distinct Games},\nauthor={Dino S. 
Ratcliffe and Luca Citi and Sam Devlin and Udo Kruschwitz},\nyear={2018},\nurl={https://openreview.net/forum?id=BJB7fkWR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJB7fkWR-", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8639561564875386157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJDEbngCZ", "title": "Global Convergence of Policy Gradient Methods for Linearized Control Problems", "track": "main", "status": "Reject", "tldr": "This paper shows that model-free policy gradient methods can converge to the global optimal solution for non-convex linearized control problems.", "abstract": "Direct policy gradient methods for reinforcement learning and continuous control problems are a popular\napproach for a variety of reasons: \n1) they are easy to implement without explicit knowledge of the underlying model;\n2) they are an \"end-to-end\" approach, directly optimizing the performance metric of interest;\n3) they inherently allow for richly parameterized policies.\nA notable drawback is that even in the most basic continuous control problem (that of linear quadratic regulators), these methods must solve a non-convex optimization problem, where little is understood about their efficiency from both computational and statistical perspectives. In contrast, system identification and model based planning in optimal control theory have a much more solid theoretical footing, where much is known with regards to their computational and statistical properties. This work bridges this gap showing that (model free) policy gradient methods globally converge to the optimal solution and are efficient (polynomially so in relevant problem dependent quantities) with regards to their sample and computational complexities. ", "keywords": "linear quadratic regulator;policy gradient;natural gradient;reinforcement learning;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "Maryam Fazel;Rong Ge;Sham M. Kakade;Mehran Mesbahi", "authorids": "mfazel@uw.edu;rongge@cs.duke.edu;sham@cs.washington.edu;mesbahi@aa.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfazel2018global,\ntitle={Global Convergence of Policy Gradient Methods for Linearized Control Problems},\nauthor={Maryam Fazel and Rong Ge and Sham M. 
Kakade and Mehran Mesbahi},\nyear={2018},\nurl={https://openreview.net/forum?id=BJDEbngCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJDEbngCZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=845569123523486360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJDH5M-AW", "title": "Synthesizing Robust Adversarial Examples", "track": "main", "status": "Reject", "tldr": "We introduce a new method for synthesizing adversarial examples robust in the physical world and use it to fabricate the first 3D adversarial objects.", "abstract": "Neural network-based classifiers parallel or exceed human-level accuracy on many common tasks and are used in practical systems. Yet, neural networks are susceptible to adversarial examples, carefully perturbed inputs that cause networks to misbehave in arbitrarily chosen ways. When generated with standard methods, these examples do not consistently fool a classifier in the physical world due to a combination of viewpoint shifts, camera noise, and other natural transformations. Adversarial examples generated using standard techniques require complete control over direct input to the classifier, which is impossible in many real-world systems.\n\nWe introduce the first method for constructing real-world 3D objects that consistently fool a neural network across a wide distribution of angles and viewpoints. We present a general-purpose algorithm for generating adversarial examples that are robust across any chosen distribution of transformations. We demonstrate its application in two dimensions, producing adversarial images that are robust to noise, distortion, and affine transformation. Finally, we apply the algorithm to produce arbitrary physical 3D-printed adversarial objects, demonstrating that our approach works end-to-end in the real world. 
Our results show that adversarial examples are a practical concern for real-world systems.\n", "keywords": "adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Anish Athalye;Logan Engstrom;Andrew Ilyas;Kevin Kwok", "authorids": "aathalye@mit.edu;engstrom@mit.edu;ailyas@mit.edu;kevink16@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nathalye2018synthesizing,\ntitle={Synthesizing Robust Adversarial Examples},\nauthor={Anish Athalye and Logan Engstrom and Andrew Ilyas and Kevin Kwok},\nyear={2018},\nurl={https://openreview.net/forum?id=BJDH5M-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJDH5M-AW", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": -0.944911182523068, "gs_citation": 2119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4844547796895961001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "CausalGAN: Learning Causal Implicit Generative Models with Adversarial Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/159", "id": "BJE-4xW0W", "author_site": "Murat Kocaoglu, Christopher Snyder, Alexandros Dimakis, Sriram Vishwanath", "tldr": "We introduce causal implicit generative models, which can sample from conditional and interventional distributions and also propose two new conditional GANs which we use for training them.", "abstract": "We introduce causal implicit generative models (CiGMs): models that allow sampling from not only the true observational but also the true interventional distributions. We show that adversarial training can be used to learn a CiGM, if the generator architecture is structured based on a given causal graph. We consider the application of conditional and interventional sampling of face images with binary feature labels, such as mustache, young. We preserve the dependency structure between the labels with a given causal graph. We devise a two-stage procedure for learning a CiGM over the labels and the image. First we train a CiGM over the binary labels using a Wasserstein GAN where the generator neural network is consistent with the causal graph between the labels. Later, we combine this with a conditional GAN to generate images conditioned on the binary labels. We propose two new conditional GAN architectures: CausalGAN and CausalBEGAN. We show that the optimal generator of the CausalGAN, given the labels, samples from the image distributions conditioned on these labels. The conditional GAN combined with a trained CiGM for the labels is then a CiGM over the labels and the generated image. We show that the proposed architectures can be used to sample from observational and interventional image distributions, even for interventions which do not naturally occur in the dataset.", "keywords": "causality;structural causal models;GANs;conditional GANs;BEGAN;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Murat Kocaoglu;Christopher Snyder;Alexandros G. 
Dimakis;Sriram Vishwanath", "authorids": "mkocaoglu@utexas.edu;22csnyder@gmail.com;dimakis@austin.utexas.edu;sriram@austin.utexas.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkocaoglu2018causalgan,\ntitle={Causal{GAN}: Learning Causal Implicit Generative Models with Adversarial Training},\nauthor={Murat Kocaoglu and Christopher Snyder and Alexandros G. Dimakis and Sriram Vishwanath},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJE-4xW0W},\n}", "github": "[![github](/images/github_icon.svg) mkocaoglu/CausalGAN](https://github.com/mkocaoglu/CausalGAN) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJE-4xW0W)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 314, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16773515662718074217&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJE-4xW0W", "pdf": "https://openreview.net/pdf?id=BJE-4xW0W", "email": ";;;", "author_num": 4 }, { "title": "Critical Percolation as a Framework to Analyze the Training of Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/303", "id": "BJGWO9k0Z", "author_site": "Zohar Ringel, Rodrigo Andrade de Bem", "tldr": "A toy dataset based on critical percolation in a planar graph provides an analytical window to the training dynamics of deep neural networks ", "abstract": "In this paper we approach two relevant deep learning topics: i) tackling of graph structured input data and ii) a better understanding and analysis of deep networks and related learning algorithms. With this in mind we focus on the topological classification of reachability in a particular subset of planar graphs (Mazes). Doing so, we are able to model the topology of data while staying in Euclidean space, thus allowing its processing with standard CNN architectures. We suggest a suitable architecture for this problem and show that it can express a perfect solution to the classification task. The shape of the cost function around this solution is also derived and, remarkably, does not depend on the size of the maze in the large maze limit. Responsible for this behavior are rare events in the dataset which strongly regulate the shape of the cost function near this global minimum. We further identify an obstacle to learning in the form of poorly performing local minima in which the network chooses to ignore some of the inputs. 
We further support our claims with training experiments and numerical analysis of the cost function on networks with up to $128$ layers.", "keywords": "Deep Convolutional Networks;Loss function landscape;Graph Structured Data;Training Complexity;Theory of deep learning;Percolation theory;Anderson Localization", "primary_area": "", "supplementary_material": "", "author": "Zohar Ringel;Rodrigo Andrade de Bem", "authorids": "zoharahoz@gmail.com;rodrigo.bem@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nringel2018critical,\ntitle={Critical Percolation as a Framework to Analyze the Training of Deep Networks},\nauthor={Zohar Ringel and Rodrigo Andrade de Bem},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJGWO9k0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "1;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 2.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ucoUHIXdtIkJ:scholar.google.com/&scioq=Critical+Percolation+as+a+Framework+to+Analyze+the+Training+of+Deep+Networks&hl=en&as_sdt=0,5", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJGWO9k0Z", "pdf": "https://openreview.net/pdf?id=BJGWO9k0Z", "email": ";", "author_num": 2 }, { "title": "FusionNet: Fusing via Fully-aware Attention with Application to Machine Comprehension", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/246", "id": "BJIgi_eCZ", "author_site": "Hsin-Yuan Huang, Chenguang Zhu, Yelong Shen, Weizhu Chen", "tldr": "We propose a light-weight enhancement for attention and a neural architecture, FusionNet, to achieve SotA on SQuAD and adversarial SQuAD.", "abstract": "This paper introduces a new neural structure called FusionNet, which extends existing attention approaches from three perspectives. First, it puts forward a novel concept of \"History of Word\" to characterize attention information from the lowest word-level embedding up to the highest semantic-level representation. Second, it identifies an attention scoring function that better utilizes the \"history of word\" concept. Third, it proposes a fully-aware multi-level attention mechanism to capture the complete information in one text (such as a question) and exploit it in its counterpart (such as context or passage) layer by layer. We apply FusionNet to the Stanford Question Answering Dataset (SQuAD) and it achieves the first position for both single and ensemble model on the official SQuAD leaderboard at the time of writing (Oct. 4th, 2017). 
Meanwhile, we verify the generalization of FusionNet with two adversarial SQuAD datasets and it sets up the new state-of-the-art on both datasets: on AddSent, FusionNet increases the best F1 metric from 46.6% to 51.4%; on AddOneSent, FusionNet boosts the best F1 metric from 56.0% to 60.7%.", "keywords": "Attention Mechanism;Machine Comprehension;Natural Language Processing;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Hsin-Yuan Huang;Chenguang Zhu;Yelong Shen;Weizhu Chen", "authorids": "momohuang@gmail.com;chezhu@microsoft.com;yeshen@microsoft.com;wzchen@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhuang2018fusionnet,\ntitle={FusionNet: Fusing via Fully-aware Attention with Application to Machine Comprehension},\nauthor={Hsin-Yuan Huang and Chenguang Zhu and Yelong Shen and Weizhu Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJIgi_eCZ},\n}", "github": "[![github](/images/github_icon.svg) momohuang/FusionNet-NLI](https://github.com/momohuang/FusionNet-NLI) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BJIgi_eCZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 28, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17073455781225282077&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=BJIgi_eCZ", "pdf": "https://openreview.net/pdf?id=BJIgi_eCZ", "email": ";;;", "author_num": 4 }, { "id": "BJInEZsTb", "title": "Learning Representations and Generative Models for 3D Point Clouds", "track": "main", "status": "Workshop", "tldr": "Deep autoencoders to learn a good representation for geometric 3D point-cloud data; Generative models for point clouds.", "abstract": "Three-dimensional geometric data offer an excellent domain for studying representation learning and generative modeling. In this paper, we look at geometric data represented as point clouds. We introduce a deep autoencoder (AE) network with excellent reconstruction quality and generalization ability. The learned representations outperform the state of the art in 3D recognition tasks and enable basic shape editing applications via simple algebraic manipulations, such as semantic part editing, shape analogies and shape interpolation. We also perform a thorough study of different generative models including GANs operating on the raw point clouds, significantly improved GANs trained in the fixed latent space of our AEs, and Gaussian mixture models (GMM). 
Interestingly, GMMs trained in the latent space of our AEs produce samples of the best fidelity and diversity.\nTo perform our quantitative evaluation of generative models, we propose simple measures of fidelity and diversity based on optimally matching between sets of point clouds.", "keywords": "representation learning;auto-encoders;3D point clouds;generative models;GANs;Gaussian Mixture Models", "primary_area": "", "supplementary_material": "", "author": "Panos Achlioptas;Olga Diamanti;Ioannis Mitliagkas;Leonidas Guibas", "authorids": "optas@cs.stanford.edu;diamanti@stanford.edu;ioannis@iro.umontreal.ca;guibas@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nachlioptas2018learning,\ntitle={Learning Representations and Generative Models for 3D Point Clouds},\nauthor={Panos Achlioptas and Olga Diamanti and Ioannis Mitliagkas and Leonidas Guibas},\nyear={2018},\nurl={https://openreview.net/forum?id=BJInEZsTb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJInEZsTb", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;5;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 1765, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9902857073066842718&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "id": "BJInMmWC-", "title": "Generative Entity Networks: Disentangling Entitites and Attributes in Visual Scenes using Partial Natural Language Descriptions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative image models have made significant progress in the last few years, and are now able to generate low-resolution images which sometimes look realistic. However, the state-of-the-art models utilize fully entangled latent representations where small changes to a single neuron can affect every output pixel in relatively arbitrary ways, and different neurons have possibly arbitrary relationships with each other. This limits the ability of such models to generalize to new combinations or orientations of objects as well as their ability to connect with more structured representations such as natural language, without explicit strong supervision. In this work we explore the synergistic effect of using partial natural language scene descriptions to help disentangle the latent entities visible in an image. We present a novel neural network architecture called Generative Entity Networks, which jointly generates both the natural language descriptions and the images from a set of latent entities. Our model is based on the variational autoencoder framework and makes use of visual attention to identify and characterise the visual attributes of each entity. 
Using the Shapeworld dataset, we show that our representation both enables a better generative model of images, leading to higher quality image samples, as well as creating more semantically useful representations that improve performance over purely discriminative models on a simple natural language yes/no question answering task.", "keywords": "VAE;Generative Model;Vision;Natural Language", "primary_area": "", "supplementary_material": "", "author": "Charlie Nash;Sebastian Nowozin;Nate Kushman", "authorids": "charlie.nash@ed.ac.uk;sebastian.nowozin@microsoft.com;nate@kushman.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnash2018generative,\ntitle={Generative Entity Networks: Disentangling Entitites and Attributes in Visual Scenes using Partial Natural Language Descriptions},\nauthor={Charlie Nash and Sebastian Nowozin and Nate Kushman},\nyear={2018},\nurl={https://openreview.net/forum?id=BJInMmWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJInMmWC-", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8343493028126952699&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "BJJ9bz-0-", "title": "Reinforcement Learning from Imperfect Demonstrations", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Robust real-world learning should benefit from both demonstrations and interaction with the environment. Current approaches to learning from demonstration and reward perform supervised learning on expert demonstration data and use reinforcement learning to further improve performance based on reward from the environment. These tasks have divergent losses which are difficult to jointly optimize; further, such methods can be very sensitive to noisy demonstrations. We propose a unified reinforcement learning algorithm that effectively normalizes the Q-function, reducing the Q-values of actions unseen in the demonstration data. Our Normalized Actor-Critic (NAC) method can learn from demonstration data of arbitrary quality and also leverages rewards from an interactive environment. NAC learns an initial policy network from demonstration and refines the policy in a real environment. Crucially, both learning from demonstration and interactive refinement use exactly the same objective, unlike prior approaches that combine distinct supervised and reinforcement losses. This makes NAC robust to suboptimal demonstration data, since the method is not forced to mimic all of the examples in the dataset. 
We show that our unified reinforcement learning algorithm can learn robustly and outperform existing baselines when evaluated on several realistic driving games.", "keywords": "learning from demonstration;reinforcement learning;maximum entropy learning", "primary_area": "", "supplementary_material": "", "author": "Yang Gao;Huazhe(Harry) Xu;Ji Lin;Fisher Yu;Sergey Levine;Trevor Darrell", "authorids": "yg@eecs.berkeley.edu;huazhe_xu@eecs.berkeley.edu;lin-j14@mails.tsinghua.edu.cn;fy@eecs.berkeley.edu;svlevine@eecs.berkeley.edu;trevor@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngao2018reinforcement,\ntitle={Reinforcement Learning from Imperfect Demonstrations},\nauthor={Yang Gao and Huazhe(Harry) Xu and Ji Lin and Fisher Yu and Sergey Levine and Trevor Darrell},\nyear={2018},\nurl={https://openreview.net/forum?id=BJJ9bz-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJJ9bz-0-", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 271, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12599937312051323354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Deep Autoencoding Gaussian Mixture Model for Unsupervised Anomaly Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/126", "id": "BJJLHbb0-", "author_site": "Bo Zong, Qi Song, Martin Min, Wei Cheng, Cristian Lumezanu, Daeki Cho, Haifeng Chen", "tldr": "An end-to-end trained deep neural network that leverages Gaussian Mixture Modeling to perform density estimation and unsupervised anomaly detection in a low-dimensional space learned by deep autoencoder.", "abstract": "Unsupervised anomaly detection on multi- or high-dimensional data is of great importance in both fundamental machine learning research and industrial applications, for which density estimation lies at the core. Although previous approaches based on dimensionality reduction followed by density estimation have made fruitful progress, they mainly suffer from decoupled model learning with inconsistent optimization goals and incapability of preserving essential information in the low-dimensional space. In this paper, we present a Deep Autoencoding Gaussian Mixture Model (DAGMM) for unsupervised anomaly detection. Our model utilizes a deep autoencoder to generate a low-dimensional representation and reconstruction error for each input data point, which is further fed into a Gaussian Mixture Model (GMM). Instead of using decoupled two-stage training and the standard Expectation-Maximization (EM) algorithm, DAGMM jointly optimizes the parameters of the deep autoencoder and the mixture model simultaneously in an end-to-end fashion, leveraging a separate estimation network to facilitate the parameter learning of the mixture model. The joint optimization, which well balances autoencoding reconstruction, density estimation of latent representation, and regularization, helps the autoencoder escape from less attractive local optima and further reduce reconstruction errors, avoiding the need of pre-training. 
Experimental results on several public benchmark datasets show that DAGMM significantly outperforms state-of-the-art anomaly detection techniques, and achieves up to 14% improvement based on the standard F1 score.", "keywords": "Density estimation;unsupervised anomaly detection;high-dimensional data;Deep autoencoder;Gaussian mixture modeling;latent low-dimensional space", "primary_area": "", "supplementary_material": "", "author": "Bo Zong;Qi Song;Martin Renqiang Min;Wei Cheng;Cristian Lumezanu;Daeki Cho;Haifeng Chen", "authorids": "bzong@nec-labs.com;qsong@nec-labs.com;renqiang@nec-labs.com;weicheng@nec-labs.com;lume@nec-labs.com;dkcho@nec-labs.com;haifeng@nec-labs.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nzong2018deep,\ntitle={Deep Autoencoding Gaussian Mixture Model for Unsupervised Anomaly Detection},\nauthor={Bo Zong and Qi Song and Martin Renqiang Min and Wei Cheng and Cristian Lumezanu and Daeki Cho and Haifeng Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJJLHbb0-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BJJLHbb0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "8;8;8", "confidence": "5;4;4", "rating_avg": 8.0, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 2410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5266060849312268888&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJJLHbb0-", "pdf": "https://openreview.net/pdf?id=BJJLHbb0-", "email": ";;;;;;", "author_num": 7 }, { "id": "BJLmN8xRW", "title": "Character Level Based Detection of DGA Domain Names", "track": "main", "status": "Reject", "tldr": "A comparison of five deep neural network architectures for detection of malicious domain names shows surprisingly little difference.", "abstract": "Recently several different deep learning architectures have been proposed that take a string of characters as the raw input signal and automatically derive features for text classification. Few studies are available that compare the effectiveness of these approaches for character based text classification with each other. In this paper we perform such an empirical comparison for the important cybersecurity problem of DGA detection: classifying domain names as either benign vs. produced by malware (i.e., by a Domain Generation Algorithm). 
Training and evaluating on a dataset with 2M domain names shows that there is surprisingly little difference between various convolutional neural network (CNN) and recurrent neural network (RNN) based architectures in terms of accuracy, prompting a preference for the simpler architectures, since they are faster to train and less prone to overfitting.", "keywords": "deep neural networks;short text classification;cybersecurity;domain generation algorithms;malicious domain names", "primary_area": "", "supplementary_material": "", "author": "Bin Yu;Jie Pan;Jiaming Hu;Anderson Nascimento;Martine De Cock", "authorids": "biny@infoblox.com;jiep@uw.edu;huj22@uw.edu;andclay@uw.edu;mdecock@uw.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyu2018character,\ntitle={Character Level Based Detection of {DGA} Domain Names},\nauthor={Bin Yu and Jie Pan and Jiaming Hu and Anderson Nascimento and Martine De Cock},\nyear={2018},\nurl={https://openreview.net/forum?id=BJLmN8xRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJLmN8xRW", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9830409112747995680&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BJMuY-gRW", "title": "Jointly Learning Sentence Embeddings and Syntax with Unsupervised Tree-LSTMs", "track": "main", "status": "Reject", "tldr": "Represent sentences by composing them with Tree-LSTMs according to automatically induced parse trees.", "abstract": "We introduce a neural network that represents sentences by composing their words according to induced binary parse trees. We use Tree-LSTM as our composition function, applied along a tree structure found by a fully differentiable natural language chart parser. Our model simultaneously optimises both the composition function and the parser, thus eliminating the need for externally-provided parse trees which are normally required for Tree-LSTM. It can therefore be seen as a tree-based RNN that is unsupervised with respect to the parse trees. As it is fully differentiable, our model is easily trained with an off-the-shelf gradient descent method and backpropagation. We demonstrate that it achieves better performance compared to various supervised Tree-LSTM architectures on a textual entailment task and a reverse dictionary task. 
Finally, we show how performance can be improved with an attention mechanism which fully exploits the parse chart, by attending over all possible subspans of the sentence.", "keywords": "hierarchical;tree-lstm;treelstm;syntax;composition", "primary_area": "", "supplementary_material": "", "author": "Jean Maillard;Stephen Clark;Dani Yogatama", "authorids": "jean@maillard.it;sc609@cam.ac.uk;dyogatama@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmaillard2018jointly,\ntitle={Jointly Learning Sentence Embeddings and Syntax with Unsupervised Tree-{LSTM}s},\nauthor={Jean Maillard and Stephen Clark and Dani Yogatama},\nyear={2018},\nurl={https://openreview.net/forum?id=BJMuY-gRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJMuY-gRW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18191141837720178044&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "TRAINING GENERATIVE ADVERSARIAL NETWORKS VIA PRIMAL-DUAL SUBGRADIENT METHODS: A LAGRANGIAN PERSPECTIVE ON GAN", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/273", "id": "BJNRFNlRW", "author_site": "Xu Chen, Jiang Wang, Hao Ge", "tldr": "We propose a primal-dual subgradient method for training GANs and this method effectively alleviates mode collapse.", "abstract": "We relate the minimax game of generative adversarial networks (GANs) to finding the saddle points of the Lagrangian function for a convex optimization problem, where the discriminator outputs and the distribution of generator outputs play the roles of primal variables and dual variables, respectively. This formulation shows the connection between the standard GAN training process and the primal-dual subgradient methods for convex optimization. The inherent connection does not only provide a theoretical convergence proof for training GANs in the function space, but also inspires a novel objective function for training. The modified objective function forces the distribution of generator outputs to be updated along the direction according to the primal-dual subgradient methods. A toy example shows that the proposed method is able to resolve mode collapse, which in this case cannot be avoided by the standard GAN or Wasserstein GAN. 
Experiments on both Gaussian mixture synthetic data and real-world image datasets demonstrate the performance of the proposed method on generating diverse samples.", "keywords": "GAN;Primal-Dual Subgradient;Mode Collapse;Saddle Point", "primary_area": "", "supplementary_material": "", "author": "Xu Chen;Jiang Wang;Hao Ge", "authorids": "chenxugz@gmail.com;wangjiangb@gmail.com;haoge2013@u.northwestern.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018training,\ntitle={{TRAINING} {GENERATIVE} {ADVERSARIAL} {NETWORKS} {VIA} {PRIMAL}-{DUAL} {SUBGRADIENT} {METHODS}: A {LAGRANGIAN} {PERSPECTIVE} {ON} {GAN}},\nauthor={Xu Chen and Jiang Wang and Hao Ge},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJNRFNlRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10318032410304057699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJNRFNlRW", "pdf": "https://openreview.net/pdf?id=BJNRFNlRW", "email": ";;", "author_num": 3 }, { "title": "Learning to Represent Programs with Graphs", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/216", "id": "BJOFETxR-", "author_site": "Miltiadis Allamanis, Marc Brockschmidt, Mahmoud Khademi", "tldr": "Programs have structure that can be represented as graphs, and graph neural networks can learn to find bugs on such graphs", "abstract": "Learning tasks on source code (i.e., formal languages) have been considered recently, but most work has tried to transfer natural language methods and does not capitalize on the unique opportunities offered by code's known syntax. For example, long-range dependencies induced by using the same variable or function in distant locations are often not considered. We propose to use graphs to represent both the syntactic and semantic structure of code and use graph-based deep learning methods to learn to reason over program structures.\n\nIn this work, we present how to construct graphs from source code and how to scale Gated Graph Neural Networks training to such large graphs. We evaluate our method on two tasks: VarNaming, in which a network attempts to predict the name of a variable given its usage, and VarMisuse, in which the network learns to reason about selecting the correct variable that should be used at a given program location. Our comparison to methods that use less structured program representations shows the advantages of modeling known structure, and suggests that our models learn to infer meaningful names and to solve the VarMisuse task in many cases. 
Additionally, our testing showed that VarMisuse identifies a number of bugs in mature open-source projects.", "keywords": "programs;source code;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Miltiadis Allamanis;Marc Brockschmidt;Mahmoud Khademi", "authorids": "miallama@microsoft.com;mabrocks@microsoft.com;mkhademi@sfu.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nallamanis2018learning,\ntitle={Learning to Represent Programs with Graphs},\nauthor={Miltiadis Allamanis and Marc Brockschmidt and Mahmoud Khademi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJOFETxR-},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/graph-based-code-modelling](https://github.com/Microsoft/graph-based-code-modelling) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJOFETxR-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;4;4", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1066, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9342740598325165289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJOFETxR-", "pdf": "https://openreview.net/pdf?id=BJOFETxR-", "email": ";;", "author_num": 3 }, { "id": "BJQPG5lR-", "title": "Avoiding degradation in deep feed-forward networks by phasing out skip-connections", "track": "main", "status": "Reject", "tldr": "Phasing out skip-connections in a principled manner avoids degradation in deep feed-forward networks.", "abstract": "A widely observed phenomenon in deep learning is the degradation problem: increasing\nthe depth of a network leads to a decrease in performance on both test and training data. Novel architectures such as ResNets and Highway networks have addressed this issue by introducing various flavors of skip-connections or gating mechanisms. However, the degradation problem persists in the context of plain feed-forward networks. In this work we propose a simple method to address this issue. The proposed method poses the learning of weights in deep networks as a constrained optimization problem where the presence of skip-connections is penalized by Lagrange multipliers. This allows for skip-connections to be introduced during the early stages of training and subsequently phased out in a principled manner. 
We demonstrate the benefits of such an approach with experiments on MNIST, fashion-MNIST, CIFAR-10 and CIFAR-100 where the proposed method is shown to greatly decrease the degradation effect (compared to plain networks) and is often competitive with ResNets.", "keywords": "optimization;vanishing gradients;shattered gradients;skip-connections", "primary_area": "", "supplementary_material": "", "author": "Ricardo Pio Monti;Sina Tootoonian;Robin Cao", "authorids": "r.monti@ucl.ac.uk;sina@gatsby.ucl.ac.uk;robin.cao@ucl.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npio2018avoiding,\ntitle={Avoiding degradation in deep feed-forward networks by phasing out skip-connections},\nauthor={Ricardo Pio Monti and Sina Tootoonian and Robin Cao},\nyear={2018},\nurl={https://openreview.net/forum?id=BJQPG5lR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJQPG5lR-", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;5;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9038356097842721883&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Hierarchical Representations for Efficient Architecture Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/45", "id": "BJQRKzbA-", "author_site": "Hanxiao Liu, Karen Simonyan, Oriol Vinyals, Chrisantha Fernando, Koray Kavukcuoglu", "tldr": "In this paper we propose a hierarchical architecture representation in which doing random or evolutionary architecture search yields highly competitive results using fewer computational resources than the prior art.", "abstract": "We explore efficient neural architecture search methods and show that a simple yet powerful evolutionary algorithm can discover new architectures with excellent performance. Our approach combines a novel hierarchical genetic representation scheme that imitates the modularized design pattern commonly adopted by human experts, and an expressive search space that supports complex topologies. Our algorithm efficiently discovers architectures that outperform a large number of manually designed models for image classification, obtaining top-1 error of 3.6% on CIFAR-10 and 20.3% when transferred to ImageNet, which is competitive with the best existing neural architecture search approaches. 
We also present results using random search, achieving 0.3% less top-1 accuracy on CIFAR-10 and 0.1% less on ImageNet whilst reducing the search time from 36 hours down to 1 hour.", "keywords": "deep learning;architecture search", "primary_area": "", "supplementary_material": "", "author": "Hanxiao Liu;Karen Simonyan;Oriol Vinyals;Chrisantha Fernando;Koray Kavukcuoglu", "authorids": "hanxiaol@cs.cmu.edu;simonyan@google.com;vinyals@google.com;chrisantha@google.com;korayk@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nliu2018hierarchical,\ntitle={Hierarchical Representations for Efficient Architecture Search},\nauthor={Hanxiao Liu and Karen Simonyan and Oriol Vinyals and Chrisantha Fernando and Koray Kavukcuoglu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJQRKzbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 1205, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8727964422666186494&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJQRKzbA-", "pdf": "https://openreview.net/pdf?id=BJQRKzbA-", "email": ";;;;", "author_num": 5 }, { "title": "Compressing Word Embeddings via Deep Compositional Code Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/242", "id": "BJRZzFlRb", "author_site": "Raphael Shu, Hideki Nakayama", "tldr": "Compressing the word embeddings over 94% without hurting the performance.", "abstract": "Natural language processing (NLP) models often require a massive number of parameters for word embeddings, resulting in a large storage or memory footprint. Deploying neural NLP models to mobile devices requires compressing the word embeddings without any significant sacrifices in performance. For this purpose, we propose to construct the embeddings with few basis vectors. For each word, the composition of basis vectors is determined by a hash code. To maximize the compression rate, we adopt the multi-codebook quantization approach instead of binary coding scheme. Each code is composed of multiple discrete numbers, such as (3, 2, 1, 8), where the value of each component is limited to a fixed range. We propose to directly learn the discrete codes in an end-to-end neural network by applying the Gumbel-softmax trick. Experiments show the compression rate achieves 98% in a sentiment analysis task and 94% ~ 99% in machine translation tasks without performance loss. In both tasks, the proposed method can improve the model performance by slightly lowering the compression rate. 
Compared to other approaches such as character-level segmentation, the proposed method is language-independent and does not require modifications to the network architecture.", "keywords": "natural language processing;word embedding;compression;deep learning", "primary_area": "", "supplementary_material": "", "author": "Raphael Shu;Hideki Nakayama", "authorids": "shu@nlab.ci.i.u-tokyo.ac.jp;nakayama@ci.i.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nshu2018compressing,\ntitle={Compressing Word Embeddings via Deep Compositional Code Learning},\nauthor={Raphael Shu and Hideki Nakayama},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJRZzFlRb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BJRZzFlRb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 166, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7815180689701290231&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJRZzFlRb", "pdf": "https://openreview.net/pdf?id=BJRZzFlRb", "email": ";", "author_num": 2 }, { "id": "BJRxfZbAW", "title": "The Context-Aware Learner", "track": "main", "status": "Reject", "tldr": "", "abstract": "One important aspect of generalization in machine learning involves reasoning about previously seen data in new settings. Such reasoning requires learning disentangled representations of data which are interpretable in isolation, but can also be combined in a new, unseen scenario. To this end, we introduce the context-aware learner, a model based on the variational autoencoding framework, which can learn such representations across data sets exhibiting a number of distinct contexts. Moreover, it is successfully able to combine these representations to generate data not seen at training time. The model enjoys an exponential increase in representational ability for a linear increase in context count. We demonstrate that the theory readily extends to a meta-learning setting such as this, and describe a fully unsupervised model in complete generality. 
Finally, we validate our approach using an adaptation with weak supervision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Conor Durkan;Amos Storkey;Harrison Edwards", "authorids": "conor.durkan@ed.ac.uk;a.storkey@ed.ac.uk;h.l.edwards@sms.ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndurkan2018the,\ntitle={The Context-Aware Learner},\nauthor={Conor Durkan and Amos Storkey and Harrison Edwards},\nyear={2018},\nurl={https://openreview.net/forum?id=BJRxfZbAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJRxfZbAW", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "BJ_QxP1AZ", "title": "Unleashing the Potential of CNNs for Interpretable Few-Shot Learning", "track": "main", "status": "Reject", "tldr": "We enable ordinary CNNs for few-shot learning by exploiting visual concepts which are interpretable visual cues learnt within CNNs.", "abstract": "Convolutional neural networks (CNNs) have been generally acknowledged as one of the driving forces for the advancement of computer vision. Despite their promising performances on many tasks, CNNs still face major obstacles on the road to achieving ideal machine intelligence. One is that CNNs are complex and hard to interpret. Another is that standard CNNs require large amounts of annotated data, which is sometimes very hard to obtain, and it is desirable to be able to learn them from few examples. In this work, we address these limitations of CNNs by developing novel, simple, and interpretable models for few-shot learning. Our models are based on the idea of encoding objects in terms of visual concepts, which are interpretable visual cues represented by the feature vectors within CNNs. We first adapt the learning of visual concepts to the few-shot setting, and then uncover two key properties of feature encoding using visual concepts, which we call category sensitivity and spatial pattern. Motivated by these properties, we present two intuitive models for the problem of few-shot learning. Experiments show that our models achieve competitive performances, while being much more flexible and interpretable than alternative state-of-the-art few-shot learning methods. 
We conclude that using visual concepts helps expose the natural capability of CNNs for few-shot learning.", "keywords": "Few-Shot Learning;Neural Network Understanding;Visual Concepts", "primary_area": "", "supplementary_material": "", "author": "Boyang Deng;Qing Liu;Siyuan Qiao;Alan Yuille", "authorids": "billydeng@buaa.edu.cn;qingliu@jhu.edu;siyuan.qiao@jhu.edu;alan.yuille@jhu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndeng2018unleashing,\ntitle={Unleashing the Potential of {CNN}s for Interpretable Few-Shot Learning},\nauthor={Boyang Deng and Qing Liu and Siyuan Qiao and Alan Yuille},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ_QxP1AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJ_QxP1AZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10983750414995444959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Recasting Gradient-Based Meta-Learning as Hierarchical Bayes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/313", "id": "BJ_UL-k0b", "author_site": "Erin Grant, Chelsea Finn, Sergey Levine, Trevor Darrell, Thomas L Griffiths", "tldr": "A specific gradient-based meta-learning algorithm, MAML, is equivalent to an inference procedure in a hierarchical Bayesian model. We use this connection to improve MAML via methods from approximate inference and curvature estimation.", "abstract": "Meta-learning allows an intelligent agent to leverage prior learning episodes as a basis for quickly improving performance on a novel task. Bayesian hierarchical modeling provides a theoretical framework for formalizing meta-learning as inference for a set of parameters that are shared across tasks. Here, we reformulate the model-agnostic meta-learning algorithm (MAML) of Finn et al. (2017) as a method for probabilistic inference in a hierarchical Bayesian model. In contrast to prior methods for meta-learning via hierarchical Bayes, MAML is naturally applicable to complex function approximators through its use of a scalable gradient descent procedure for posterior inference. Furthermore, the identification of MAML as hierarchical Bayes provides a way to understand the algorithm\u2019s operation as a meta-learning procedure, as well as an opportunity to make use of computational strategies for efficient inference. 
We use this opportunity to propose an improvement to the MAML algorithm that makes use of techniques from approximate inference and curvature estimation.", "keywords": "meta-learning;learning to learn;hierarchical Bayes;approximate Bayesian methods", "primary_area": "", "supplementary_material": "", "author": "Erin Grant;Chelsea Finn;Sergey Levine;Trevor Darrell;Thomas Griffiths", "authorids": "eringrant@berkeley.edu;cbfinn@eecs.berkeley.edu;svlevine@eecs.berkeley.edu;trevor@eecs.berkeley.edu;tom_griffiths@berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngrant2018recasting,\ntitle={Recasting Gradient-Based Meta-Learning as Hierarchical Bayes},\nauthor={Erin Grant and Chelsea Finn and Sergey Levine and Trevor Darrell and Thomas Griffiths},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ_UL-k0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 686, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5613028967445157198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJ_UL-k0b", "pdf": "https://openreview.net/pdf?id=BJ_UL-k0b", "email": ";;;;", "author_num": 5 }, { "title": "Deep Rewiring: Training very sparse deep networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/291", "id": "BJ_wN01C-", "author_site": "Guillaume Bellec, David Kappel, Wolfgang Maass, Robert Legenstein", "tldr": "The paper presents Deep Rewiring, an algorithm that can be used to train deep neural networks when the network connectivity is severely constrained during training.", "abstract": "Neuromorphic hardware tends to pose limits on the connectivity of deep networks that one can run on them. But also generic hardware and software implementations of deep learning run more efficiently for sparse networks. Several methods exist for pruning connections of a neural network after it was trained without connectivity constraints. We present an algorithm, DEEP R, that enables us to train directly a sparsely connected neural network. DEEP R automatically rewires the network during supervised training so that connections are there where they are most needed for the task, while its total number is all the time strictly bounded. We demonstrate that DEEP R can be used to train very sparse feedforward and recurrent neural networks on standard benchmark tasks with just a minor loss in performance. 
DEEP R is based on a rigorous theoretical foundation that views rewiring as stochastic sampling of network configurations from a posterior.", "keywords": "deep learning;pruning;LSTM;convolutional networks;recurrent neural network;sparse networks;neuromorphic hardware;energy efficient computing;low memory hardware;stochastic differential equation;fokker-planck equation", "primary_area": "", "supplementary_material": "", "author": "Guillaume Bellec;David Kappel;Wolfgang Maass;Robert Legenstein", "authorids": "bellec@igi.tugraz.at;kappel@igi.tugraz.at;maass@igi.tugraz.at;legenstein@igi.tugraz.at", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbellec2018deep,\ntitle={Deep Rewiring: Training very sparse deep networks},\nauthor={Guillaume Bellec and David Kappel and Wolfgang Maass and Robert Legenstein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJ_wN01C-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=BJ_wN01C-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "5;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 352, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6848423277041156645&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=BJ_wN01C-", "pdf": "https://openreview.net/pdf?id=BJ_wN01C-", "email": ";;;", "author_num": 4 }, { "id": "BJaU__eCZ", "title": "Hallucinating brains with artificial brains", "track": "main", "status": "Reject", "tldr": "Two novel GANs are constructed to generate high-quality 3D fMRI brain images and synthetic brain images greatly help to improve downstream classification tasks.", "abstract": "Human brain function as measured by functional magnetic resonance imaging\n(fMRI), exhibits a rich diversity. In response, understanding the individual variability\nof brain function and its association with behavior has become one of the\nmajor concerns in modern cognitive neuroscience. Our work is motivated by the\nview that generative models provide a useful tool for understanding this variability.\nTo this end, this manuscript presents two novel generative models trained\non real neuroimaging data which synthesize task-dependent functional brain images.\nBrain images are high dimensional tensors which exhibit structured spatial\ncorrelations. Thus, both models are 3D conditional Generative Adversarial networks\n(GANs) which apply Convolutional Neural Networks (CNNs) to learn an\nabstraction of brain image representations. Our results show that the generated\nbrain images are diverse, yet task dependent. In addition to qualitative evaluation,\nwe utilize the generated synthetic brain volumes as additional training data to improve\ndownstream fMRI classifiers (also known as decoding, or brain reading).\nOur approach achieves significant improvements for a variety of datasets, classifi-\ncation tasks and evaluation scores. 
Our classification results provide a quantitative\nevaluation of the quality of the generated images, and also serve as an additional\ncontribution of this manuscript.", "keywords": "3D fMRI data;Deep Learning;Generative Adversarial Network;Classification", "primary_area": "", "supplementary_material": "", "author": "Peiye Zhuang;Alexander G. Schwing;Oluwasanmi Koyejo", "authorids": "py_zhuang@bupt.edu.cn;aschwing@illinois.edu;sanmi@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhuang2018hallucinating,\ntitle={Hallucinating brains with artificial brains},\nauthor={Peiye Zhuang and Alexander G. Schwing and Oluwasanmi Koyejo},\nyear={2018},\nurl={https://openreview.net/forum?id=BJaU__eCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJaU__eCZ", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wSxpoqQINzQJ:scholar.google.com/&scioq=Hallucinating+brains+with+artificial+brains&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJcAWaeCW", "title": "Graph Topological Features via GAN", "track": "main", "status": "Reject", "tldr": "A GAN based method to learn important topological features of an arbitrary input graph.", "abstract": "Inspired by the success of generative adversarial networks (GANs) in image domains, we introduce a novel hierarchical architecture for learning characteristic topological features from a single arbitrary input graph via GANs. The hierarchical architecture consisting of multiple GANs preserves both local and global topological features, and automatically partitions the input graph into representative stages for feature learning. The stages facilitate reconstruction and can be used as indicators of the importance of the associated topological structures. Experiments show that our method produces subgraphs retaining a wide range of topological features, even in early reconstruction stages. 
This paper contains original research on combining the use of GANs and graph topological analysis.", "keywords": "graph topology;GAN;network science;hierarchical learning", "primary_area": "", "supplementary_material": "", "author": "Weiyi Liu;Hal Cooper;Min-Hwan Oh", "authorids": "weiyiliu@us.ibm.com;hal.cooper@columbia.edu;m.oh@columbia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2018graph,\ntitle={Graph Topological Features via {GAN}},\nauthor={Weiyi Liu and Hal Cooper and Min-Hwan Oh},\nyear={2018},\nurl={https://openreview.net/forum?id=BJcAWaeCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJcAWaeCW", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YWorq8Zwe7wJ:scholar.google.com/&scioq=Graph+Topological+Features+via+GAN&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Do GANs learn the distribution? Some Theory and Empirics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/72", "id": "BJehNfW0-", "author_site": "Sanjeev Arora, Andrej Risteski, Yi Zhang", "tldr": "We propose a support size estimator of GANs's learned distribution to show they indeed suffer from mode collapse, and we prove that encoder-decoder GANs do not avoid the issue as well.", "abstract": "Do GANS (Generative Adversarial Nets) actually learn the target distribution? The foundational paper of Goodfellow et al. (2014) suggested they do, if they were given sufficiently large deep nets, sample size, and computation time. A recent theoretical analysis in Arora et al. (2017) raised doubts whether the same holds when the discriminator has bounded size. It showed that the training objective can approach its optimum value even if the generated distribution has very low support. In other words, the training objective is unable to prevent mode collapse. The current paper makes two contributions. (1) It proposes a novel test for estimating support size using the birthday paradox of discrete probability. Using this test, evidence is presented that well-known GAN approaches do learn distributions of fairly low support. (2) It theoretically studies encoder-decoder GANs architectures (e.g., BiGAN/ALI), which were proposed to learn more meaningful features via GANs, and consequently to also solve the mode-collapse issue. Our result shows that such encoder-decoder training objectives also cannot guarantee learning of the full distribution because they cannot prevent serious mode collapse. 
More seriously, they cannot prevent learning meaningless codes for data, contrary to usual intuition.", "keywords": "Generative Adversarial Networks;mode collapse;birthday paradox;support size estimation", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Andrej Risteski;Yi Zhang", "authorids": "arora@cs.princeton.edu;risteski@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\narora2018do,\ntitle={Do {GAN}s learn the distribution? Some Theory and Empirics},\nauthor={Sanjeev Arora and Andrej Risteski and Yi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJehNfW0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 191, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15435512625777710261&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJehNfW0-", "pdf": "https://openreview.net/pdf?id=BJehNfW0-", "email": ";;", "author_num": 3 }, { "id": "BJgPCveAW", "title": "Characterizing Sparse Connectivity Patterns in Neural Networks", "track": "main", "status": "Reject", "tldr": "Neural networks can be pre-defined to have sparse connectivity without performance degradation.", "abstract": "We propose a novel way of reducing the number of parameters in the storage-hungry fully connected layers of a neural network by using pre-defined sparsity, where the majority of connections are absent prior to starting training. Our results indicate that convolutional neural networks can operate without any loss of accuracy at less than 0.5% classification layer connection density, or less than 5% overall network connection density. We also investigate the effects of pre-defining the sparsity of networks with only fully connected layers. Based on our sparsifying technique, we introduce the `scatter' metric to characterize the quality of a particular connection pattern. As proof of concept, we show results on CIFAR, MNIST and a new dataset on classifying Morse code symbols, which highlights some interesting trends and limits of sparse connection patterns.", "keywords": "Machine learning;Neural networks;Sparse neural networks;Pre-defined sparsity;Scatter;Connectivity patterns;Adjacency matrix;Parameter Reduction;Morse code", "primary_area": "", "supplementary_material": "", "author": "Sourya Dey;Kuan-Wen Huang;Peter A. Beerel;Keith M. Chugg", "authorids": "souryade@usc.edu;kuanwenh@usc.edu;pabeerel@usc.edu;chugg@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndey2018characterizing,\ntitle={Characterizing Sparse Connectivity Patterns in Neural Networks},\nauthor={Sourya Dey and Kuan-Wen Huang and Peter A. Beerel and Keith M. 
Chugg},\nyear={2018},\nurl={https://openreview.net/forum?id=BJgPCveAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJgPCveAW", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2774416344966290429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJgVaG-Ab", "title": "AUTOMATA GUIDED HIERARCHICAL REINFORCEMENT LEARNING FOR ZERO-SHOT SKILL COMPOSITION", "track": "main", "status": "Reject", "tldr": "Combine temporal logic with hierarchical reinforcement learning for skill composition", "abstract": "An obstacle that prevents the wide adoption of (deep) reinforcement learning (RL) in control systems is its need for a large number of interactions with the environment in order to master a skill. The learned skill usually generalizes poorly across domains and re-training is often necessary when presented with a new task. We present a framework that combines techniques in \\textit{formal methods} with \\textit{hierarchical reinforcement learning} (HRL). The set of techniques we provide allows for the convenient specification of tasks with logical expressions, learns hierarchical policies (meta-controller and low-level controllers) with well-defined intrinsic rewards using any RL methods and is able to construct new skills from existing ones without additional learning. We evaluate the proposed methods in a simple grid world simulation as well as simulation on a Baxter robot. ", "keywords": "Hierarchical reinforcement learning;temporal logic;skill composition", "primary_area": "", "supplementary_material": "", "author": "Xiao Li;Yao Ma;Calin Belta", "authorids": "xli87@bu.edu;yaoma@bu.edu;cbelta@bu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2018automata,\ntitle={{AUTOMATA} {GUIDED} {HIERARCHICAL} {REINFORCEMENT} {LEARNING} {FOR} {ZERO}-{SHOT} {SKILL} {COMPOSITION}},\nauthor={Xiao Li and Yao Ma and Calin Belta},\nyear={2018},\nurl={https://openreview.net/forum?id=BJgVaG-Ab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJgVaG-Ab", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2404912283424727386&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "BJgd7m0xRZ", "title": "Unsupervised Adversarial Anomaly Detection using One-Class Support Vector Machines", "track": "main", "status": "Reject", "tldr": "A novel method to increase the resistance of OCSVMs against targeted, integrity attacks by selective nonlinear transformations of data to lower dimensions.", "abstract": "Anomaly detection discovers regular patterns in unlabeled data and identifies the non-conforming data points, which in some cases are the result of malicious attacks by adversaries. 
Learners such as One-Class Support Vector Machines (OCSVMs) have been successfully applied in anomaly detection, yet their performance may degrade significantly in the presence of sophisticated adversaries, who target the algorithm itself by compromising the integrity of the training data. With the rise in the use of machine learning in mission critical day-to-day activities where errors may have significant consequences, it is imperative that machine learning systems are made secure. To address this, we propose a defense mechanism that is based on a contraction of the data, and we test its effectiveness using OCSVMs. The proposed approach introduces a layer of uncertainty on top of the OCSVM learner, making it infeasible for the adversary to guess the specific configuration of the learner. We theoretically analyze the effects of adversarial perturbations on the separating margin of OCSVMs and provide empirical evidence on several benchmark datasets, which show that by carefully contracting the data in low dimensional spaces, we can successfully identify adversarial samples that would not have been identifiable in the original dimensional space. The numerical results show that the proposed method improves OCSVM performance significantly (2-7%).", "keywords": "anomaly detection;one class support vector machine;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Prameesha Sandamal Weerasinghe;Tansu Alpcan;Sarah Monazam Erfani;Christopher Leckie", "authorids": "pweerasinghe@student.unimelb.edu.au;tansu.alpcan@unimelb.edu.au;sarah.erfani@unimelb.edu.au;caleckie@unimelb.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsandamal2018unsupervised,\ntitle={Unsupervised Adversarial Anomaly Detection using One-Class Support Vector Machines},\nauthor={Prameesha Sandamal Weerasinghe and Tansu Alpcan and Sarah Monazam Erfani and Christopher Leckie},\nyear={2018},\nurl={https://openreview.net/forum?id=BJgd7m0xRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJgd7m0xRZ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12247573034775160452&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "BJhxcGZCW", "title": "Generative Discovery of Relational Medical Entity Pairs", "track": "main", "status": "Reject", "tldr": "Generatively discover meaningful, novel entity pairs with a certain medical relationship by purely learning from the existing meaningful entity pairs, without the requirement of additional text corpus for discriminative extraction.", "abstract": "Online healthcare services can provide the general public with ubiquitous access to medical knowledge and reduce the information access cost for both individuals and societies. To promote these benefits, it is desired to effectively expand the scale of high-quality yet novel relational medical entity pairs that embody rich medical knowledge in a structured form. 
To fulfill this goal, we introduce a generative model called Conditional Relationship Variational Autoencoder (CRVAE), which can discover meaningful and novel relational medical entity pairs without the requirement of additional external knowledge. Rather than discriminatively identifying the relationship between two given medical entities in a free-text corpus, we directly model and understand medical relationships from diversely expressed medical entity pairs. The proposed model introduces the generative modeling capacity of variational autoencoder to entity pairs, and has the ability to discover new relational medical entity pairs solely based on the existing entity pairs. Beside entity pairs, relationship-enhanced entity representations are obtained as another appealing benefit of the proposed method. Both quantitative and qualitative evaluations on real-world medical datasets demonstrate the effectiveness of the proposed method in generating relational medical entity pairs that are meaningful and novel.", "keywords": "Knowledge Discovery;Generative Modeling;Medical;Entity Pair", "primary_area": "", "supplementary_material": "", "author": "Chenwei Zhang;Yaliang Li;Nan Du;Wei Fan;Philip S. Yu", "authorids": "czhang99@uic.edu;yaliangli@baidu.com;nandu@baidu.com;davidwfan@tencent.com;psyu@uic.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2018generative,\ntitle={Generative Discovery of Relational Medical Entity Pairs},\nauthor={Chenwei Zhang and Yaliang Li and Nan Du and Wei Fan and Philip S. Yu},\nyear={2018},\nurl={https://openreview.net/forum?id=BJhxcGZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJhxcGZCW", "pdf_size": 0, "rating": "2;4;4", "confidence": "5;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=420819078877087404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "A Bayesian Perspective on Generalization and Stochastic Gradient Descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/289", "id": "BJij4yg0Z", "author_site": "Samuel Smith, Quoc V Le", "tldr": "Generalization is strongly correlated with the Bayesian evidence, and gradient noise drives SGD towards minima whose evidence is large.", "abstract": "We consider two questions at the heart of machine learning; how can we predict if a minimum will generalize to the test set, and why does stochastic gradient descent find minima that generalize well? Our work responds to \\citet{zhang2016understanding}, who showed deep neural networks can easily memorize randomly labeled training data, despite generalizing well on real labels of the same inputs. We show that the same phenomenon occurs in small linear models. These observations are explained by the Bayesian evidence, which penalizes sharp minima but is invariant to model parameterization. We also demonstrate that, when one holds the learning rate fixed, there is an optimum batch size which maximizes the test set accuracy. We propose that the noise introduced by small mini-batches drives the parameters towards minima whose evidence is large. 
Interpreting stochastic gradient descent as a stochastic differential equation, we identify the ``noise scale\" $g = \\epsilon (\\frac{N}{B} - 1) \\approx \\epsilon N/B$, where $\\epsilon$ is the learning rate, $N$ the training set size and $B$ the batch size. Consequently the optimum batch size is proportional to both the learning rate and the size of the training set, $B_{opt} \\propto \\epsilon N$. We verify these predictions empirically.", "keywords": "generalization;stochastic gradient descent;stochastic differential equations;scaling rules;large batch training;bayes theorem;batch size", "primary_area": "", "supplementary_material": "", "author": "Samuel L. Smith and Quoc V. Le;Samuel L. Smith and Quoc V. Le", "authorids": "slsmith@google.com;qvl@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nl.2018a,\ntitle={A Bayesian Perspective on Generalization and Stochastic Gradient Descent},\nauthor={Samuel L. Smith and Quoc V. Le},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJij4yg0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13471305971267525875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJij4yg0Z", "pdf": "https://openreview.net/pdf?id=BJij4yg0Z", "email": ";", "author_num": 2 }, { "title": "Few-Shot Learning with Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/43", "id": "BJj6qGbRW", "author_site": "Victor Garcia Satorras, Joan Bruna", "tldr": "", "abstract": "We propose to study the problem of few-shot learning with the prism of inference on a partially observed graphical model, constructed from a collection of input images whose label can be either observed or not. By assimilating generic message-passing inference algorithms with their neural-network counterparts, we define a graph neural network architecture that generalizes several of the recently proposed few-shot learning models. 
Besides providing improved numerical performance, our framework is easily extended to variants of few-shot learning, such as semi-supervised or active learning, demonstrating the ability of graph-based models to operate well on \u2018relational\u2019 tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Victor Garcia Satorras;Joan Bruna Estrach", "authorids": "vgsatorras@gmail.com;bruna@cims.nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ngarcia2018fewshot,\ntitle={Few-Shot Learning with Graph Neural Networks},\nauthor={Victor Garcia Satorras and Joan Bruna Estrach},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJj6qGbRW},\n}", "github": "[![github](/images/github_icon.svg) vgsatorras/few-shot-gnn](https://github.com/vgsatorras/few-shot-gnn) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJj6qGbRW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15420545241088720867&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJj6qGbRW", "pdf": "https://openreview.net/pdf?id=BJj6qGbRW", "email": ";", "author_num": 2 }, { "id": "BJjBnN9a-", "title": "Continuous Convolutional Neural Networks for Image Classification", "track": "main", "status": "Reject", "tldr": "This paper proposes a novel convolutional layer that operates in a continuous Reproducing Kernel Hilbert Space.", "abstract": "This paper introduces the concept of continuous convolution to neural networks and deep learning applications in general. Rather than directly using discretized information, input data is first projected into a high-dimensional Reproducing Kernel Hilbert Space (RKHS), where it can be modeled as a continuous function using a series of kernel bases. We then proceed to derive a closed-form solution to the continuous convolution operation between two arbitrary functions operating in different RKHS. Within this framework, convolutional filters also take the form of continuous functions, and the training procedure involves learning the RKHS to which each of these filters is projected, alongside their weight parameters. This results in much more expressive filters, that do not require spatial discretization and benefit from properties such as adaptive support and non-stationarity. 
Experiments on image classification are performed, using classical datasets, with results indicating that the proposed continuous convolutional neural network is able to achieve competitive accuracy rates with far fewer parameters and a faster convergence rate.", "keywords": "convolutional neural networks;image classification;deep learning;feature representation;hilbert maps;reproducing kernel hilbert space", "primary_area": "", "supplementary_material": "", "author": "Vitor Guizilini;Fabio Ramos", "authorids": "vitor.guizilini@sydney.edu.au;fabio.ramos@sydney.edu.au", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nguizilini2018continuous,\ntitle={Continuous Convolutional Neural Networks for Image Classification},\nauthor={Vitor Guizilini and Fabio Ramos},\nyear={2018},\nurl={https://openreview.net/forum?id=BJjBnN9a-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJjBnN9a-", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;2", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17613032149958421718&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJjquybCW", "title": "The loss surface and expressivity of deep convolutional neural networks", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We analyze the expressiveness and loss surface of practical deep convolutional\nneural networks (CNNs) with shared weights and max pooling layers. We show\nthat such CNNs produce linearly independent features at a \u201cwide\u201d layer which\nhas more neurons than the number of training samples. This condition holds e.g.\nfor the VGG network. Furthermore, we provide for such wide CNNs necessary\nand sufficient conditions for global minima with zero training error. For the case\nwhere the wide layer is followed by a fully connected layer we show that almost\nevery critical point of the empirical loss is a global minimum with zero training\nerror. Our analysis suggests that both depth and width are very important in deep\nlearning. 
While depth brings more representational power and allows the network\nto learn high level features, width smoothes the optimization landscape of the\nloss function in the sense that a sufficiently wide network has a well-behaved loss\nsurface with almost no bad local minima.", "keywords": "convolutional neural networks;loss surface;expressivity;critical point;global minima;linear separability", "primary_area": "", "supplementary_material": "", "author": "Quynh Nguyen;Matthias Hein", "authorids": "quynh@cs.uni-saarland.de;hein@cs.uni-saarland.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnguyen2018the,\ntitle={The loss surface and expressivity of deep convolutional neural networks},\nauthor={Quynh Nguyen and Matthias Hein},\nyear={2018},\nurl={https://openreview.net/forum?id=BJjquybCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BJjquybCW", "pdf_size": 0, "rating": "4;5;6;7", "confidence": "4;2;3;2", "rating_avg": 5.5, "confidence_avg": 2.75, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -0.674199862463242, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2975418608598838581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Guide Actor-Critic for Continuous Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/186", "id": "BJk59JZ0b", "author_site": "Voot Tangkaratt, , Masashi Sugiyama", "tldr": "This paper proposes a novel actor-critic method that uses Hessians of a critic to update an actor.", "abstract": "Actor-critic methods solve reinforcement learning problems by updating a parameterized policy known as an actor in a direction that increases an estimate of the expected return known as a critic. However, existing actor-critic methods only use values or gradients of the critic to update the policy parameter. In this paper, we propose a novel actor-critic method called the guide actor-critic (GAC). GAC firstly learns a guide actor that locally maximizes the critic and then it updates the policy parameter based on the guide actor by supervised learning. Our main theoretical contributions are two folds. First, we show that GAC updates the guide actor by performing second-order optimization in the action space where the curvature matrix is based on the Hessians of the critic. Second, we show that the deterministic policy gradient method is a special case of GAC when the Hessians are ignored. 
Through experiments, we show that our method is a promising reinforcement learning method for continuous controls.\n", "keywords": "Reinforcement learning;actor-critic;continuous control", "primary_area": "", "supplementary_material": "", "author": "Voot Tangkaratt;Abbas Abdolmaleki;Masashi Sugiyama", "authorids": "voot.tangkaratt@riken.jp;abbas.a@ua.pt;masashi.sugiyama@riken.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntangkaratt2018guide,\ntitle={Guide Actor-Critic for Continuous Control},\nauthor={Voot Tangkaratt and Abbas Abdolmaleki and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJk59JZ0b},\n}", "github": "[![github](/images/github_icon.svg) voot-t/guide-actor-critic](https://github.com/voot-t/guide-actor-critic)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;2;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.1889822365046137, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6316181617581438246&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJk59JZ0b", "pdf": "https://openreview.net/pdf?id=BJk59JZ0b", "email": ";;", "author_num": 3 }, { "title": "Global Optimality Conditions for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/90", "id": "BJk7Gf-CZ", "author_site": "Chulhee Yun, Suvrit Sra, Ali Jadbabaie", "tldr": "We provide efficiently checkable necessary and sufficient conditions for global optimality in deep linear neural networks, with some initial extensions to nonlinear settings.", "abstract": "We study the error landscape of deep linear and nonlinear neural networks with the squared error loss. Minimizing the loss of a deep linear neural network is a nonconvex problem, and despite recent progress, our understanding of this loss surface is still incomplete. For deep linear networks, we present necessary and sufficient conditions for a critical point of the risk function to be a global minimum. Surprisingly, our conditions provide an efficiently checkable test for global optimality, while such tests are typically intractable in nonconvex optimization. 
We further extend these results to deep nonlinear neural networks and prove similar sufficient conditions for global optimality, albeit in a more limited function space setting.", "keywords": "deep linear neural networks;global optimality;deep learning", "primary_area": "", "supplementary_material": "", "author": "Chulhee Yun;Suvrit Sra;Ali Jadbabaie", "authorids": "chulheey@mit.edu;suvrit@mit.edu;jadbabai@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyun2018global,\ntitle={Global Optimality Conditions for Deep Neural Networks},\nauthor={Chulhee Yun and Suvrit Sra and Ali Jadbabaie},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJk7Gf-CZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1941500514244419548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJk7Gf-CZ", "pdf": "https://openreview.net/pdf?id=BJk7Gf-CZ", "email": ";;", "author_num": 3 }, { "id": "BJlrSmbAZ", "title": "Bayesian Uncertainty Estimation for Batch Normalized Deep Networks", "track": "main", "status": "Reject", "tldr": "We show that training a deep network using batch normalization is equivalent to approximate inference in Bayesian models, and we demonstrate how this finding allows us to make useful estimates of the model uncertainty in conventional networks.", "abstract": "Deep neural networks have led to a series of breakthroughs, dramatically improving the state-of-the-art in many domains. The techniques driving these advances, however, lack a formal method to account for model uncertainty. While the Bayesian approach to learning provides a solid theoretical framework to handle uncertainty, inference in Bayesian-inspired deep neural networks is difficult. In this paper, we provide a practical approach to Bayesian learning that relies on a regularization technique found in nearly every modern network, batch normalization. We show that training a deep network using batch normalization is equivalent to approximate inference in Bayesian models, and we demonstrate how this finding allows us to make useful estimates of the model uncertainty. Using our approach, it is possible to make meaningful uncertainty estimates using conventional architectures without modifying the network or the training procedure. 
Our approach is thoroughly validated in a series of empirical experiments on different tasks and using various measures, showing it to outperform baselines on a majority of datasets with strong statistical significance.", "keywords": "uncertainty estimation;deep learning;Bayesian learning;batch normalization", "primary_area": "", "supplementary_material": "", "author": "Mattias Teye;Hossein Azizpour;Kevin Smith", "authorids": "teye@kth.se;azizpour@kth.se;ksmith@kth.se", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nteye2018bayesian,\ntitle={Bayesian Uncertainty Estimation for Batch Normalized Deep Networks},\nauthor={Mattias Teye and Hossein Azizpour and Kevin Smith},\nyear={2018},\nurl={https://openreview.net/forum?id=BJlrSmbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJlrSmbAZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 308, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17902835651299889830&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "BJluxbWC-", "title": "Unseen Class Discovery in Open-world Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper concerns open-world classification, where the classifier not only needs to classify test examples into seen classes that have appeared in training but also reject examples from unseen or novel classes that have not appeared in training. Specifically, this paper focuses on discovering the hidden unseen classes of the rejected examples. Clearly, without prior knowledge this is difficult. However, we do have the data from the seen training classes, which can tell us what kind of similarity/difference is expected for examples from the same class or from different classes. It is reasonable to assume that this knowledge can be transferred to the rejected examples and used to discover the hidden unseen classes in them. This paper aims to solve this problem. It first proposes a joint open classification model with a sub-model for classifying whether a pair of examples belongs to the same or different classes. This sub-model can serve as a distance function for clustering to discover the hidden classes of the rejected examples. 
Experimental results show that the proposed model is highly promising.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lei Shu;Hu Xu;Bing Liu", "authorids": "lshu3@uic.edu;hxu48@uic.edu;liub@uic.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nshu2018unseen,\ntitle={Unseen Class Discovery in Open-world Classification},\nauthor={Lei Shu and Hu Xu and Bing Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=BJluxbWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJluxbWC-", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5158863395713350128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Dynamic Neural Program Embeddings for Program Repair", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/69", "id": "BJuWrGW0Z", "author_site": "Ke Wang, Rishabh Singh, Zhendong Su", "tldr": "A new way of learning semantic program embedding", "abstract": "Neural program embeddings have shown much promise recently for a variety of program analysis tasks, including program synthesis, program repair, code completion, and fault localization. However, most existing program embeddings are based on syntactic features of programs, such as token sequences or abstract syntax trees. Unlike images and text, a program has well-de\ufb01ned semantics that can be dif\ufb01cult to capture by only considering its syntax (i.e. syntactically similar programs can exhibit vastly different run-time behavior), which makes syntax-based program embeddings fundamentally limited. We propose a novel semantic program embedding that is learned from program execution traces. Our key insight is that program states expressed as sequential tuples of live variable values not only capture program semantics more precisely, but also offer a more natural \ufb01t for Recurrent Neural Networks to model. We evaluate different syntactic and semantic program embeddings on the task of classifying the types of errors that students make in their submissions to an introductory programming class and on the CodeHunt education platform. Our evaluation results show that the semantic program embeddings signi\ufb01cantly outperform the syntactic program embeddings based on token sequences and abstract syntax trees. 
In addition, we augment a search-based program repair system with predictions made from our semantic embedding and demonstrate signi\ufb01cantly improved search ef\ufb01ciency.\n", "keywords": "Program Embedding;Program Semantics;Dynamic Traces", "primary_area": "", "supplementary_material": "", "author": "Ke Wang;Rishabh Singh;Zhendong Su", "authorids": "kbwang@ucdavis.edu;risin@microsoft.com;su@cs.ucdavis.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwang2018dynamic,\ntitle={Dynamic Neural Program Embeddings for Program Repair},\nauthor={Ke Wang and Zhendong Su and Rishabh Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BJuWrGW0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11012975812962066685&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJuWrGW0Z", "pdf": "https://openreview.net/pdf?id=BJuWrGW0Z", "email": ";;", "author_num": 3 }, { "id": "BJubPWZRW", "title": "Cross-View Training for Semi-Supervised Learning", "track": "main", "status": "Workshop", "tldr": "Self-training with different views of the input gives excellent results for semi-supervised image recognition, sequence tagging, and dependency parsing.", "abstract": "We present Cross-View Training (CVT), a simple but effective method for deep semi-supervised learning. On labeled examples, the model is trained with standard cross-entropy loss. On an unlabeled example, the model first performs inference (acting as a \"teacher\") to produce soft targets. The model then learns from these soft targets (acting as a ``\"student\"). We deviate from prior work by adding multiple auxiliary student prediction layers to the model. The input to each student layer is a sub-network of the full model that has a restricted view of the input (e.g., only seeing one region of an image). The students can learn from the teacher (the full model) because the teacher sees more of each example. Concurrently, the students improve the quality of the representations used by the teacher as they learn to make predictions with limited data. When combined with Virtual Adversarial Training, CVT improves upon the current state-of-the-art on semi-supervised CIFAR-10 and semi-supervised SVHN. We also apply CVT to train models on five natural language processing tasks using hundreds of millions of sentences of unlabeled data. On all tasks CVT substantially outperforms supervised learning alone, resulting in models that improve upon or are competitive with the current state-of-the-art.\n", "keywords": "semi-supervised learning;image recognition;sequence tagging;dependency parsing", "primary_area": "", "supplementary_material": "", "author": "Kevin Clark;Thang Luong;Quoc V. 
Le", "authorids": "kevclark@cs.stanford.edu;qvl@google.com;thangluong@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nclark2018crossview,\ntitle={Cross-View Training for Semi-Supervised Learning},\nauthor={Kevin Clark and Thang Luong and Quoc V. Le},\nyear={2018},\nurl={https://openreview.net/forum?id=BJubPWZRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJubPWZRW", "pdf_size": 0, "rating": "2;5;7", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11028415106557743188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "BJvVbCJCb", "title": "Neural Clustering By Predicting And Copying Noise", "track": "main", "status": "Reject", "tldr": "Neural clustering without needing a number of clusters", "abstract": "We propose a neural clustering model that jointly learns both latent features and how they cluster. Unlike similar methods our model does not require a predefined number of clusters. Using a supervised approach, we agglomerate latent features towards randomly sampled targets within the same space whilst progressively removing the targets until we are left with only targets which represent cluster centroids. To show the behavior of our model across different modalities we apply our model on both text and image data and very competitive results on MNIST. Finally, we also provide results against baseline models for fashion-MNIST, the 20 newsgroups dataset, and a Twitter dataset we ourselves create.", "keywords": "unsupervised learning;clustering;deep learning", "primary_area": "", "supplementary_material": "", "author": "Sam Coope;Andrej Zukov-Gregoric;Yoram Bachrach", "authorids": "sam@digitalgenius.com;andrej@digitalgenius.com;yoram@digitalgenius.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncoope2018neural,\ntitle={Neural Clustering By Predicting And Copying Noise},\nauthor={Sam Coope and Andrej Zukov-Gregoric and Yoram Bachrach},\nyear={2018},\nurl={https://openreview.net/forum?id=BJvVbCJCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJvVbCJCb", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZVsIx57PMoYJ:scholar.google.com/&scioq=Neural+Clustering+By+Predicting+And+Copying+Noise&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJvWjcgAZ", "title": "Sample-Efficient Deep Reinforcement Learning via Episodic Backward Update", "track": "main", "status": "Reject", "tldr": "We propose Episodic Backward Update, a novel deep reinforcement learning algorithm which samples transitions episode by episode and updates values recursively in a backward manner to achieve fast and stable learning.", "abstract": "We propose Episodic Backward Update - a new algorithm to boost the performance of a deep 
reinforcement learning agent by fast reward propagation. In contrast to the conventional use of the replay memory with uniform random sampling, our agent samples a whole episode and successively propagates the value of a state into its previous states. Our computationally efficient recursive algorithm allows sparse and delayed rewards to propagate effectively throughout the sampled episode. We evaluate our algorithm on 2D MNIST Maze Environment and 49 games of the Atari 2600 Environment and show that our agent improves sample efficiency with a competitive computational cost.", "keywords": "Deep Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Su Young Lee;Sungik Choi;Sae-Young Chung", "authorids": "sy9424@kaist.ac.kr;si_choi@kaist.ac.kr;schung@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyoung2018sampleefficient,\ntitle={Sample-Efficient Deep Reinforcement Learning via Episodic Backward Update},\nauthor={Su Young Lee and Sungik Choi and Sae-Young Chung},\nyear={2018},\nurl={https://openreview.net/forum?id=BJvWjcgAZ},\n}", "github": "[![github](/images/github_icon.svg) suyoung-lee/Episodic-Backward-Update](https://github.com/suyoung-lee/Episodic-Backward-Update)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJvWjcgAZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4339423520544824474&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "BJy0fcgRZ", "title": "Capturing Human Category Representations by Sampling in Deep Feature Spaces", "track": "main", "status": "Workshop", "tldr": "using deep neural networks and clever algorithms to capture human mental visual concepts", "abstract": "Understanding how people represent categories is a core problem in cognitive science, with the flexibility of human learning remaining a gold standard to which modern artificial intelligence and machine learning aspire. Decades of psychological research have yielded a variety of formal theories of categories, yet validating these theories with naturalistic stimuli remains a challenge. The problem is that human category representations cannot be directly observed and running informative experiments with naturalistic stimuli such as images requires having a workable representation of these stimuli. Deep neural networks have recently been successful in a range of computer vision tasks and provide a way to represent the features of images. In this paper, we introduce a method for estimating the structure of human categories that draws on ideas from both cognitive science and machine learning, blending human-based algorithms with state-of-the-art deep representation learners. We provide qualitative and quantitative results as a proof of concept for the feasibility of the method. 
Samples drawn from human distributions rival the quality of current state-of-the-art generative models and outperform alternative methods for estimating the structure of human categories.", "keywords": "category representations;psychology;cognitive science;deep neural networks", "primary_area": "", "supplementary_material": "", "author": "Joshua Peterson;Krishan Aghi;Jordan Suchow;Alexander Ku;Tom Griffiths", "authorids": "peterson.c.joshua@gmail.com;kaghi@berkeley.edu;suchow@berkeley.edu;alexku@berkeley.edu;tom_griffiths@berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npeterson2018capturing,\ntitle={Capturing Human Category Representations by Sampling in Deep Feature Spaces},\nauthor={Joshua Peterson and Krishan Aghi and Jordan Suchow and Alexander Ku and Tom Griffiths},\nyear={2018},\nurl={https://openreview.net/forum?id=BJy0fcgRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJy0fcgRZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11133683223303879024&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BJypUGZ0Z", "title": "Accelerating Neural Architecture Search using Performance Prediction", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Methods for neural network hyperparameter optimization and meta-modeling are computationally expensive due to the need to train a large number of model configurations. In this paper, we show that standard frequentist regression models can predict the final performance of partially trained model configurations using features based on network architectures, hyperparameters, and time series validation performance data. We empirically show that our performance prediction models are much more effective than prominent Bayesian counterparts, are simpler to implement, and are faster to train. Our models can predict final performance in both visual classification and language modeling domains, are effective for predicting performance of drastically varying model architectures, and can even generalize between model classes. Using these prediction models, we also propose an early stopping method for hyperparameter optimization and meta-modeling, which obtains a speedup of a factor up to 6x in both hyperparameter optimization and meta-modeling. Finally, we empirically show that our early stopping method can be seamlessly incorporated into both reinforcement learning-based architecture selection algorithms and bandit based search methods. 
Through extensive experimentation, we empirically show our performance prediction models and early stopping algorithm are state-of-the-art in terms of prediction accuracy and speedup achieved while still identifying the optimal model configurations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bowen Baker*;Otkrist Gupta*;Ramesh Raskar;Nikhil Naik", "authorids": "bowen@mit.edu;otkrist@mit.edu;raskar@mit.edu;naik@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbaker*2018accelerating,\ntitle={Accelerating Neural Architecture Search using Performance Prediction},\nauthor={Bowen Baker* and Otkrist Gupta* and Ramesh Raskar and Nikhil Naik},\nyear={2018},\nurl={https://openreview.net/forum?id=BJypUGZ0Z},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BJypUGZ0Z)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJypUGZ0Z", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 446, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14535739396438847664&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJyy3a0Ez", "title": "Continuous Propagation: Layer-Parallel Training", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Continuous propagation is a parallel technique for training deep neural networks with batch size one at full utilization of a multiprocessor system. It enables spatially distributed computations on emerging deep learning hardware accelerators that do not impose programming limitations of contemporary GPUs. The algorithm achieves model parallelism along the depth of a deep network. The method is based on the continuous representation of the optimization process and enables sustained gradient generation during all phases of computation. We demonstrate that in addition to its increased concurrency, continuous propagation improves the convergence rate of state of the art methods while matching their accuracy. 
", "keywords": "Deep Learning;Model parallelism;Learning theory", "primary_area": "", "supplementary_material": "", "author": "Michael James;Devansh Arpit;Herman Sahota;Ilya Sharapov", "authorids": "michae@cerebras.net;devansharpit@gmail.com;herman@cerebras.net;ilya@cerebras.net", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJyy3a0Ez", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fEDbPxOtHJoJ:scholar.google.com/&scioq=Continuous+Propagation:+Layer-Parallel+Training&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Bk-ofQZRb", "title": "TD Learning with Constrained Gradients", "track": "main", "status": "Reject", "tldr": "We show that adding a constraint to TD updates stabilizes learning and allows Deep Q-learning without a target network", "abstract": "Temporal Difference Learning with function approximation is known to be unstable. Previous work like \\citet{sutton2009fast} and \\citet{sutton2009convergent} has presented alternative objectives that are stable to minimize. However, in practice, TD-learning with neural networks requires various tricks like using a target network that updates slowly \\citep{mnih2015human}. In this work we propose a constraint on the TD update that minimizes change to the target values. This constraint can be applied to the gradients of any TD objective, and can be easily applied to nonlinear function approximation. We validate this update by applying our technique to deep Q-learning, and training without a target network. 
We also show that adding this constraint on Baird's counterexample keeps Q-learning from diverging.", "keywords": "Reinforcement Learning;TD Learning;DQN", "primary_area": "", "supplementary_material": "", "author": "Ishan Durugkar;Peter Stone", "authorids": "ishand@cs.utexas.edu;pstone@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndurugkar2018td,\ntitle={{TD} Learning with Constrained Gradients},\nauthor={Ishan Durugkar and Peter Stone},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk-ofQZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk-ofQZRb", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4241701462628417007&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Bk346Ok0W", "title": "Sensor Transformation Attention Networks", "track": "main", "status": "Reject", "tldr": "We introduce a modular multi-sensor network architecture with an attentional mechanism that enables dynamic sensor selection on real-world noisy data from CHiME-3.", "abstract": "Recent work on encoder-decoder models for sequence-to-sequence mapping has shown that integrating both temporal and spatial attentional mechanisms into neural networks increases the performance of the system substantially. We report on a new modular network architecture that applies an attentional mechanism not on temporal and spatial regions of the input, but on sensor selection for multi-sensor setups. This network called the sensor transformation attention network (STAN) is evaluated in scenarios which include the presence of natural noise or synthetic dynamic noise. We demonstrate how the attentional signal responds dynamically to changing noise levels and sensor-specific noise, leading to reduced word error rates (WERs) on both audio and visual tasks using TIDIGITS and GRID; and also on CHiME-3, a multi-microphone real-world noisy dataset. The improvement grows as more channels are corrupted as demonstrated on the CHiME-3 dataset. 
Moreover, the proposed STAN architecture naturally introduces a number of advantages including ease of removing sensors from existing architectures, attentional interpretability, and increased robustness to a variety of noise environments.", "keywords": "attention;sensor-selection;multi-sensor;natural noise", "primary_area": "", "supplementary_material": "", "author": "Stefan Braun;Daniel Neil;Enea Ceolini;Jithendar Anumula;Shih-Chii Liu", "authorids": "brauns@ethz.ch;daniel.l.neil@gmail.com;enea.ceolini@ini.uzh.ch;anumula@ini.uzh.ch;shih@ini.ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbraun2018sensor,\ntitle={Sensor Transformation Attention Networks},\nauthor={Stefan Braun and Daniel Neil and Enea Ceolini and Jithendar Anumula and Shih-Chii Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk346Ok0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Bk346Ok0W", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x6lW8nPyl18J:scholar.google.com/&scioq=Sensor+Transformation+Attention+Networks&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "id": "Bk6qQGWRb", "title": "Efficient Exploration through Bayesian Deep Q-Networks", "track": "main", "status": "Reject", "tldr": "Using Bayesian regression to estimate the posterior over Q-functions and deploy Thompson Sampling as a targeted exploration strategy with efficient trade-off the exploration and exploitation", "abstract": "We propose Bayesian Deep Q-Network (BDQN), a practical Thompson sampling based Reinforcement Learning (RL) Algorithm. Thompson sampling allows for targeted exploration in high dimensions through posterior sampling but is usually computationally expensive. We address this limitation by introducing uncertainty only at the output layer of the network through a Bayesian Linear Regression (BLR) model, which can be trained with fast closed-form updates and its samples can be drawn efficiently through the Gaussian distribution. We apply our method to a wide range of Atari Arcade Learning Environments. 
Since BDQN carries out more efficient exploration, it is able to reach higher rewards substantially faster than a key baseline, DDQN.", "keywords": "Deep RL;Thompson Sampling;Posterior update", "primary_area": "", "supplementary_material": "", "author": "Kamyar Azizzadenesheli;Emma Brunskill;Animashree Anandkumar", "authorids": "kazizzad@uci.edu;ebrun@cs.stanford.edu;animakumar@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nazizzadenesheli2018efficient,\ntitle={Efficient Exploration through Bayesian Deep Q-Networks},\nauthor={Kamyar Azizzadenesheli and Emma Brunskill and Animashree Anandkumar},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk6qQGWRb},\n}", "github": "[![github](/images/github_icon.svg) kazizzad/BDQN-MxNet-Gluon](https://github.com/kazizzad/BDQN-MxNet-Gluon)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bk6qQGWRb", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13679807550374293657&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "Bk7wvW-C-", "title": "Exploring Asymmetric Encoder-Decoder Structure for Context-based Sentence Representation Learning", "track": "main", "status": "Reject", "tldr": "We proposed an RNN-CNN encoder-decoder model for fast unsupervised sentence representation learning.", "abstract": "Context information plays an important role in human language understanding, and it is also useful for machines to learn vector representations of language. In this paper, we explore an asymmetric encoder-decoder structure for unsupervised context-based sentence representation learning. As a result, we build an encoder-decoder architecture with an RNN encoder and a CNN decoder, and we show that neither an autoregressive decoder nor an RNN decoder is required. We further combine a suite of effective designs to significantly improve model efficiency while also achieving better performance. Our model is trained on two different large unlabeled corpora, and in both cases transferability is evaluated on a set of downstream language understanding tasks. We empirically show that our model is simple and fast while producing rich sentence representations that excel in downstream tasks.", "keywords": "asymmetric structure;RNN-CNN;fast;unsupervised;representation;sentence", "primary_area": "", "supplementary_material": "", "author": "Shuai Tang;Hailin Jin;Chen Fang;Zhaowen Wang;Virginia R. de Sa", "authorids": "shuaitang93@ucsd.edu;hljin@adobe.com;cfang@adobe.com;zhawang@adobe.com;desa@ucsd.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntang2018exploring,\ntitle={Exploring Asymmetric Encoder-Decoder Structure for Context-based Sentence Representation Learning},\nauthor={Shuai Tang and Hailin Jin and Chen Fang and Zhaowen Wang and Virginia R. 
de Sa},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk7wvW-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Bk7wvW-C-", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.9707253433941508, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17459877656362444136&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Eigenoption Discovery through the Deep Successor Representation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/201", "id": "Bk8ZcAxR-", "author_site": "Marlos C. Machado, Clemens Rosenbaum, Xiaoxiao Guo, Miao Liu, Gerald Tesauro, Murray Campbell", "tldr": "We show how we can use the successor representation to discover eigenoptions in stochastic domains, from raw pixels. Eigenoptions are options learned to navigate the latent dimensions of a learned representation.", "abstract": "Options in reinforcement learning allow agents to hierarchically decompose a task into subtasks, having the potential to speed up learning and planning. However, autonomously learning effective sets of options is still a major challenge in the field. In this paper we focus on the recently introduced idea of using representation learning methods to guide the option discovery process. Specifically, we look at eigenoptions, options obtained from representations that encode diffusive information flow in the environment. We extend the existing algorithms for eigenoption discovery to settings with stochastic transitions and in which handcrafted features are not available. We propose an algorithm that discovers eigenoptions while learning non-linear state representations from raw pixels. It exploits recent successes in the deep reinforcement learning literature and the equivalence between proto-value functions and the successor representation. We use traditional tabular domains to provide intuition about our approach and Atari 2600 games to demonstrate its potential.", "keywords": "reinforcement learning;options;successor representation;proto-value functions;Atari;Arcade Learning Environment", "primary_area": "", "supplementary_material": "", "author": "Marlos C. Machado;Clemens Rosenbaum;Xiaoxiao Guo;Miao Liu;Gerald Tesauro;Murray Campbell", "authorids": "machado@ualberta.ca;crosenbaum@umass.edu;xiaoxiao.guo@ibm.com;miao.liu1@ibm.com;gtesauro@us.ibm.com;mcam@us.ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nc.2018eigenoption,\ntitle={Eigenoption Discovery through the Deep Successor Representation},\nauthor={Marlos C. 
Machado and Clemens Rosenbaum and Xiaoxiao Guo and Miao Liu and Gerald Tesauro and Murray Campbell},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk8ZcAxR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5624122668049451375&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Bk8ZcAxR-", "pdf": "https://openreview.net/pdf?id=Bk8ZcAxR-", "email": ";;;;;", "author_num": 6 }, { "id": "Bk9nkMa4G", "title": "Bayesian Embeddings for Long-Tailed Datasets", "track": "main", "status": "Withdraw", "tldr": "Approach to improve classification accuracy on classes in the tail.", "abstract": "The statistics of the real visual world presents a long-tailed distribution: a few classes have significantly more training instances than the remaining classes in a dataset. This is because the real visual world has a few classes that are common while others are rare. Unfortunately, the performance of a convolutional neural network is typically unsatisfactory when trained using a long-tailed dataset. To alleviate this issue, we propose a method that discriminatively learns an embedding in which a simple Bayesian classifier can balance the class-priors to generalize well for rare classes. To this end, the proposed approach uses a Gaussian mixture model to factor out class-likelihoods and class-priors in a long-tailed dataset. The proposed method is simple and easy-to-implement in existing deep learning frameworks. Experiments on publicly available datasets show that the proposed approach improves the performance on classes with few training instances, while maintaining a comparable performance to the state-of-the-art on classes with abundant training examples.", "keywords": "Long-tail datasets;Imbalanced datasets", "primary_area": "", "supplementary_material": "", "author": "Victor Fragoso;Deva Ramanan", "authorids": "victor.fragoso@mail.wvu.edu;deva@andrew.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk9nkMa4G", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15396862000059148794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Neural Map: Structured Memory for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/196", "id": "Bk9zbyZCZ", "author_site": "Emilio Parisotto, Ruslan Salakhutdinov", "tldr": "", "abstract": "A critical component to enabling intelligent reasoning in partially observable environments is memory. 
Despite this importance, Deep Reinforcement Learning (DRL) agents have so far used relatively simple memory architectures, with the main methods to overcome partial observability being either a temporal convolution over the past k frames or an LSTM layer. More recent work (Oh et al., 2016) has gone beyond these architectures by using memory networks, which can allow more sophisticated addressing schemes over the past k frames. But even these architectures are unsatisfactory because they are limited to remembering information only from the last k frames. In this paper, we develop a memory system with an adaptable write operator that is customized to the sorts of 3D environments that DRL agents typically interact with. This architecture, called the Neural Map, uses a spatially structured 2D memory image to learn to store arbitrary information about the environment over long time lags. We demonstrate empirically that the Neural Map surpasses previous DRL memories on a set of challenging 2D and 3D maze environments and show that it is capable of generalizing to environments that were not seen during training. ", "keywords": "deep reinforcement learning;deep learning;memory", "primary_area": "", "supplementary_material": "", "author": "Emilio Parisotto;Ruslan Salakhutdinov", "authorids": "eparisot@andrew.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nparisotto2018,\ntitle={ Neural Map: Structured Memory for Deep Reinforcement Learning},\nauthor={Emilio Parisotto and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk9zbyZCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;9", "confidence": "5;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 320, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14158098098420736883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Bk9zbyZCZ", "pdf": "https://openreview.net/pdf?id=Bk9zbyZCZ", "email": ";", "author_num": 2 }, { "id": "BkA7gfZAb", "title": "Stable Distribution Alignment Using the Dual of the Adversarial Distance", "track": "main", "status": "Workshop", "tldr": " We propose a dual version of the logistic adversarial distance for feature alignment and show that it yields more stable gradient step iterations than the min-max objective.", "abstract": "Methods that align distributions by minimizing an adversarial distance between them have recently achieved impressive results. However, these approaches are difficult to optimize with gradient descent and they often do not converge well without careful hyperparameter tuning and proper initialization. We investigate whether turning the adversarial min-max problem into an optimization problem by replacing the maximization part with its dual improves the quality of the resulting alignment and explore its connections to Maximum Mean Discrepancy. 
Our empirical results suggest that using the dual formulation for the restricted family of linear discriminators results in a more stable convergence to a desirable solution when compared with the performance of a primal min-max GAN-like objective and an MMD objective under the same restrictions. We test our hypothesis on the problem of aligning two synthetic point clouds on a plane and on a real-image domain adaptation problem on digits. In both cases, the dual formulation yields an iterative procedure that gives more stable and monotonic improvement over time.", "keywords": "domain adaptation;adversarial networks;statistical distance;duality", "primary_area": "", "supplementary_material": "", "author": "Ben Usman;Kate Saenko;Brian Kulis", "authorids": "usmn@bu.edu;saenko@bu.edu;bkulis@bu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nusman2018stable,\ntitle={Stable Distribution Alignment Using the Dual of the Adversarial Distance},\nauthor={Ben Usman and Kate Saenko and Brian Kulis},\nyear={2018},\nurl={https://openreview.net/forum?id=BkA7gfZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkA7gfZAb", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12132112581759253285&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BkBCjzp7G", "title": "Accelerating Convolutional Neural Networks using Iterative Two-Pass Decomposition", "track": "main", "status": "Withdraw", "tldr": "We present the iterative two-pass CP decomposition flow to effectively accelerate existing convolutional neural networks (CNNs).", "abstract": "We present the iterative two-pass decomposition flow to accelerate existing convolutional neural networks (CNNs). The proposed rank selection algorithm can effectively determine the proper ranks of the target convolutional layers for the low rank approximation. Our two-pass CP-decomposition helps prevent the instability problem. The iterative flow makes the decomposition of deeper networks systematic. 
The experimental results show that VGG16 can be accelerated with a measured 6.2x speedup while the accuracy drop remains only 1.2%.\n", "keywords": "Convolutional Neural Networks;CNN;CP Decomposition;Low Rank Approximation", "primary_area": "", "supplementary_material": "", "author": "Wei-Shiang Lin;Hao-Ning Wu;Chih-Tsun Huang", "authorids": "weishianglin1993@gmail.com;wuhoward2002@gmail.com;cthuang@cs.nthu.edu.tw", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkBCjzp7G", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6311364693765790192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkCV_W-AZ", "title": "Regret Minimization for Partially Observable Deep Reinforcement Learning", "track": "main", "status": "Workshop", "tldr": "Advantage-based regret minimization is a new deep reinforcement learning algorithm that is particularly effective on partially observable tasks, such as 1st person navigation in Doom and Minecraft.", "abstract": "Deep reinforcement learning algorithms that estimate state and state-action value functions have been shown to be effective in a variety of challenging domains, including learning control strategies from raw image pixels. However, algorithms that estimate state and state-action value functions typically assume a fully observed state and must compensate for partial or non-Markovian observations by using finite-length frame-history observations or recurrent networks. In this work, we propose a new deep reinforcement learning algorithm based on counterfactual regret minimization that iteratively updates an approximation to a cumulative clipped advantage function and is robust to partially observed state. We demonstrate that on several partially observed reinforcement learning tasks, this new class of algorithms can substantially outperform strong baseline methods: on Pong with single-frame observations, and on the challenging Doom (ViZDoom) and Minecraft (Malm\u00f6) first-person navigation benchmarks.", "keywords": "deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Peter H. Jin;Sergey Levine;Kurt Keutzer", "authorids": "phj@eecs.berkeley.edu;svlevine@eecs.berkeley.edu;keutzer@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nh.2018regret,\ntitle={Regret Minimization for Partially Observable Deep Reinforcement Learning},\nauthor={Peter H. 
Jin and Sergey Levine and Kurt Keutzer},\nyear={2018},\nurl={https://openreview.net/forum?id=BkCV_W-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkCV_W-AZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=43031357070841216&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "BkDB51WR-", "title": "Learning temporal evolution of probability distribution with Recurrent Neural Network", "track": "main", "status": "Reject", "tldr": "Proposed RNN-based algorithm to estimate predictive distribution in one- and multi-step forecasts in time series prediction problems", "abstract": "We propose to tackle a time series regression problem by computing temporal evolution of a probability density function to provide a probabilistic forecast. A Recurrent Neural Network (RNN) based model is employed to learn a nonlinear operator for temporal evolution of a probability density function. We use a softmax layer for a numerical discretization of a smooth probability density functions, which transforms a function approximation problem to a classification task. Explicit and implicit regularization strategies are introduced to impose a smoothness condition on the estimated probability distribution. A Monte Carlo procedure to compute the temporal evolution of the distribution for a multiple-step forecast is presented. The evaluation of the proposed algorithm on three synthetic and two real data sets shows advantage over the compared baselines.", "keywords": "predictive distribution estimation;probabilistic RNN;uncertainty in time series prediction", "primary_area": "", "supplementary_material": "", "author": "Kyongmin Yeo;Igor Melnyk;Nam Nguyen;Eun Kyung Lee", "authorids": "kyeo@us.ibm.com;igor.melnyk@ibm.com;nnguyen@us.ibm.com;eunkyung.lee@us.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyeo2018learning,\ntitle={Learning temporal evolution of probability distribution with Recurrent Neural Network},\nauthor={Kyongmin Yeo and Igor Melnyk and Nam Nguyen and Eun Kyung Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=BkDB51WR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BkDB51WR-", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;2;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:86DnythhdpQJ:scholar.google.com/&scioq=Learning+temporal+evolution+of+probability+distribution+with+Recurrent+Neural+Network&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BkIkkseAZ", "title": "Theoretical properties of the global optimizer of two-layer Neural Network", "track": "main", "status": "Reject", "tldr": "This paper talks about theoretical properties of first-order optimal point of two layer neural network in over-parametrized case", "abstract": "In this paper, we study the problem of optimizing a two-layer artificial neural 
network that best fits a training dataset. We look at this problem in the setting where the number of parameters is greater than the number of sampled points. We show that for a wide class of differentiable activation functions (this class involves most nonlinear functions and excludes piecewise linear functions), arbitrary first-order optimal solutions satisfy global optimality provided the hidden layer is non-singular. We essentially show that these non-singular hidden layer matrices satisfy a \"good\" property for this broad class of activation functions. Techniques involved in proving this result inspire us to look at a new algorithmic framework, where in between two gradient steps on the hidden layer, we add a stochastic gradient descent (SGD) step on the output layer. In this new algorithmic framework, we extend our earlier result and show that for all finite iterations the hidden layer satisfies the \"good\" property mentioned earlier, thereby partially explaining the success of noisy gradient methods and addressing the issue of data independence in our earlier result. Both of these results extend easily from hidden layers given by a square matrix to those given by a flat matrix. The results are applicable even if the network has more than one hidden layer, provided all inner hidden layers are arbitrary and non-singular, all activations are from the given class of differentiable functions, and optimization is only with respect to the outermost hidden layer. Separately, we also study the smoothness properties of the objective function and show that it is actually Lipschitz smooth, i.e., its gradients do not change sharply. We use these smoothness properties to guarantee asymptotic convergence of $O(1/\\text{number of iterations})$ to a first-order optimal solution.", "keywords": "Non-convex optimization;Two-layer Neural Network;global optimality;first-order optimality", "primary_area": "", "supplementary_material": "", "author": "Digvijay Boob;Guanghui Lan", "authorids": "digvijaybb40@gatech.edu;george.lan@isye.gatech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nboob2018theoretical,\ntitle={Theoretical properties of the global optimizer of two-layer Neural Network},\nauthor={Digvijay Boob and Guanghui Lan},\nyear={2018},\nurl={https://openreview.net/forum?id=BkIkkseAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkIkkseAZ", "pdf_size": 0, "rating": "4;7;7", "confidence": "5;4;5", "rating_avg": 6.0, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=847997177018516961&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Defense-GAN: Protecting Classifiers Against Adversarial Attacks Using Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/113", "id": "BkJ3ibb0-", "author_site": "Pouya Samangouei, Maya Kabkab, Rama Chellappa", "tldr": "Defense-GAN uses a Generative Adversarial Network to defend against white-box and black-box attacks in classification models.", "abstract": "In recent years, deep neural network approaches have been widely adopted for machine learning tasks, including classification.
However, they were shown to be vulnerable to adversarial perturbations: carefully crafted small perturbations can cause misclassification of legitimate images. We propose Defense-GAN, a new framework leveraging the expressive capability of generative models to defend deep neural networks against such attacks. Defense-GAN is trained to model the distribution of unperturbed images. At inference time, it finds a close output to a given image which does not contain the adversarial changes. This output is then fed to the classifier. Our proposed method can be used with any classification model and does not modify the classifier structure or training procedure. It can also be used as a defense against any attack as it does not assume knowledge of the process for generating the adversarial examples. We empirically show that Defense-GAN is consistently effective against different attack methods and improves on existing defense strategies.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pouya Samangouei;Maya Kabkab;Rama Chellappa", "authorids": "pouya@umiacs.umd.edu;mayak@umiacs.umd.edu;rama@umiacs.umd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsamangouei2018defensegan,\ntitle={Defense-{GAN}: Protecting Classifiers Against Adversarial Attacks Using Generative Models},\nauthor={Pouya Samangouei and Maya Kabkab and Rama Chellappa},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkJ3ibb0-},\n}", "github": "[![github](/images/github_icon.svg) kabkabm/defensegan](https://github.com/kabkabm/defensegan) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=BkJ3ibb0-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1543, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4356922002684962280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BkJ3ibb0-", "pdf": "https://openreview.net/pdf?id=BkJ3ibb0-", "email": ";;", "author_num": 3 }, { "title": "Improving GAN Training via Binarized Representation Entropy (BRE) Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/24", "id": "BkLhaGZRW", "author_site": "Yanshuai Cao, Gavin Weiguang Ding, Yik Chau Lui, Ruitong Huang", "tldr": "", "abstract": "We propose a novel regularizer to improve the training of Generative Adversarial Networks (GANs). The motivation is that when the discriminator D spreads out its model capacity in the right way, the learning signals given to the generator G are more informative and diverse, which helps G to explore better and discover the real data manifold while avoiding large unstable jumps due to the erroneous extrapolation made by D . Our regularizer guides the rectifier discriminator D to better allocate its model capacity, by encouraging the binary activation patterns on selected internal layers of D to have a high joint entropy. 
Experimental results on both synthetic data and real datasets demonstrate improvements in stability and convergence speed of the GAN training, as well as higher sample quality. The approach also leads to higher classification accuracies in semi-supervised learning.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanshuai Cao;Gavin Weiguang Ding;Kry Yik-Chau Lui;Ruitong Huang", "authorids": "yanshuai.cao@borealisai.com;gavin.ding@borealisai.com;yikchau.y.lui@borealisai.com;ruitong.huang@borealisai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ncao2018improving,\ntitle={Improving {GAN} Training via Binarized Representation Entropy ({BRE}) Regularization},\nauthor={Yanshuai Cao and Gavin Weiguang Ding and Kry Yik-Chau Lui and Ruitong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkLhaGZRW},\n}", "github": "[![github](/images/github_icon.svg) BorealisAI/bre-gan](https://github.com/BorealisAI/bre-gan)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14467671840316463321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkLhaGZRW", "pdf": "https://openreview.net/pdf?id=BkLhaGZRW", "email": ";;;", "author_num": 4 }, { "id": "BkM27IxR-", "title": "Learning to Optimize Neural Nets", "track": "main", "status": "Reject", "tldr": "We learn an optimization algorithm that generalizes to unseen tasks", "abstract": "Learning to Optimize is a recently proposed framework for learning optimization algorithms using reinforcement learning. In this paper, we explore learning an optimization algorithm for training shallow neural nets. Such high-dimensional stochastic optimization problems present interesting challenges for existing reinforcement learning algorithms. We develop an extension that is suited to learning optimization algorithms in this setting and demonstrate that the learned optimization algorithm consistently outperforms other known optimization algorithms even on unseen tasks and is robust to changes in stochasticity of gradients and the neural net architecture. More specifically, we show that an optimization algorithm trained with the proposed method on the problem of training a neural net on MNIST generalizes to the problems of training neural nets on the Toronto Faces Dataset, CIFAR-10 and CIFAR-100. 
", "keywords": "Learning to learn;meta-learning;reinforcement learning;optimization", "primary_area": "", "supplementary_material": "", "author": "Ke Li;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;jitendram@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2018learning,\ntitle={Learning to Optimize Neural Nets},\nauthor={Ke Li and Jitendra Malik},\nyear={2018},\nurl={https://openreview.net/forum?id=BkM27IxR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkM27IxR-", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5962206971778023822&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "BkM3ibZRW", "title": "Adversarially Regularized Autoencoders", "track": "main", "status": "Workshop", "tldr": "Adversarially Regularized Autoencoders learn smooth representations of discrete structures allowing for interesting results in text generation, such as unaligned style transfer, semi-supervised learning, and latent space interpolation and arithmetic.", "abstract": "While autoencoders are a key technique in representation learning for continuous structures, such as images or wave forms, developing general-purpose autoencoders for discrete structures, such as text sequence or discretized images, has proven to be more challenging. In particular, discrete inputs make it more difficult to learn a smooth encoder that preserves the complex local relationships in the input space. In this work, we propose an adversarially regularized autoencoder (ARAE) with the goal of learning more robust discrete-space representations. ARAE jointly trains both a rich discrete-space encoder, such as an RNN, and a simpler continuous space generator function, while using generative adversarial network (GAN) training to constrain the distributions to be similar. This method yields a smoother contracted code space that maps similar inputs to nearby codes, and also an implicit latent variable GAN model for generation. Experiments on text and discretized images demonstrate that the GAN model produces clean interpolations and captures the multimodality of the original space, and that the autoencoder produces improvements in semi-supervised learning as well as state-of-the-art results in unaligned text style transfer task using only a shared continuous-space representation.", "keywords": "representation learning;natural language generation;discrete structure modeling;adversarial training;unaligned text style-transfer", "primary_area": "", "supplementary_material": "", "author": "Junbo (Jake) Zhao;Yoon Kim;Kelly Zhang;Alexander M. Rush;Yann LeCun", "authorids": "jakezhao@cs.nyu.edu;yoonkim@seas.harvard.edu;kz918@nyu.edu;srush@seas.harvard.edu;yann@cs.nyu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\n(jake)2018adversarially,\ntitle={Adversarially Regularized Autoencoders},\nauthor={Junbo (Jake) Zhao and Yoon Kim and Kelly Zhang and Alexander M. 
Rush and Yann LeCun},\nyear={2018},\nurl={https://openreview.net/forum?id=BkM3ibZRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkM3ibZRW", "pdf_size": 0, "rating": "3;5;6;9", "confidence": "4;4;3;3", "rating_avg": 5.75, "confidence_avg": 3.5, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": -0.808290376865476, "gs_citation": 390, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5024716526871945774&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Identifying Analogies Across Domains", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/226", "id": "BkN_r2lR-", "author_site": "Yedid Hoshen, Lior Wolf", "tldr": "Finding correspondences between domains by performing matching/mapping iterations", "abstract": "Identifying analogies across domains without supervision is a key task for artificial intelligence. Recent advances in cross domain image mapping have concentrated on translating images across domains. Although the progress made is impressive, the visual fidelity often does not suffice for identifying the matching sample from the other domain. In this paper, we tackle this very task of finding exact analogies between datasets, i.e., for every image from domain A find an analogous image in domain B. We present a matching-by-synthesis approach: AN-GAN, and show that it outperforms current techniques. We further show that the cross-domain mapping task can be broken into two parts: domain alignment and learning the mapping function. The tasks can be iteratively solved, and as the alignment is improved, the unsupervised translation function reaches quality comparable to full supervision. ", "keywords": "unsupervised mapping;cross domain mapping", "primary_area": "", "supplementary_material": "", "author": "Yedid Hoshen;Lior Wolf", "authorids": "yedidh@fb.com;wolf@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhoshen2018identifying,\ntitle={Identifying Analogies Across Domains},\nauthor={Yedid Hoshen and Lior Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkN_r2lR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14041517444714750081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=BkN_r2lR-", "pdf": "https://openreview.net/pdf?id=BkN_r2lR-", "email": ";", "author_num": 2 }, { "id": "BkPrDFgR-", "title": "Piecewise Linear Neural Networks verification: A comparative study", "track": "main", "status": "Reject", "tldr": "", "abstract": "The success of Deep Learning and its potential use in many important safety-critical applications has motivated research on formal verification of Neural Network (NN) models. Despite the reputation of learned NN models to behave as black boxes and theoretical hardness results of the problem of proving their properties, researchers have been successful in verifying some classes of models by exploiting their piecewise linear structure. Unfortunately, most of these works test their algorithms on their own models and do not offer any comparison with other approaches. As a result, the advantages and downsides of the different algorithms are not well understood. Motivated by the need to accelerate progress in this very important area, we investigate the trade-offs of a number of different approaches based on Mixed Integer Programming, Satisfiability Modulo Theory, as well as a novel method based on the Branch-and-Bound framework. We also propose a new data set of benchmarks, in addition to a collection of previously released test cases that can be used to compare existing methods. Our analysis not only allowed a comparison to be made between different strategies; the comparison of results from different solvers also revealed implementation bugs in published methods. We expect that the availability of our benchmark and the analysis of the different approaches will allow researchers to invent and evaluate promising approaches for making progress on this important topic.", "keywords": "Verification;SMT solver;Mixed Integer Programming;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Rudy Bunel;Ilker Turkaslan;Philip H.S. Torr;Pushmeet Kohli;M. Pawan Kumar", "authorids": "rudy@robots.ox.ac.uk;ilker.turkaslan@lmh.ox.ac.uk;philip.torr@eng.ox.ac.uk;pushmeet@google.com;pawan@robots.ox.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbunel2018piecewise,\ntitle={Piecewise Linear Neural Networks verification: A comparative study},\nauthor={Rudy Bunel and Ilker Turkaslan and Philip H.S. Torr and Pushmeet Kohli and M. Pawan Kumar},\nyear={2018},\nurl={https://openreview.net/forum?id=BkPrDFgR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkPrDFgR-", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=484731605424360306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkQCGzZ0-", "title": "Discrete Autoencoders for Sequence Models", "track": "main", "status": "Reject", "tldr": "Autoencoders for text with a new method for using discrete latent space.", "abstract": "Recurrent models for sequences have been recently successful at many tasks, especially for language modeling\nand machine translation. Nevertheless, it remains challenging to extract good representations from\nthese models. For instance, even though language has a clear hierarchical structure going from characters\nthrough words to sentences, it is not apparent in current language models.\nWe propose to improve the representation in sequence models by\naugmenting current approaches with an autoencoder that is forced to compress\nthe sequence through an intermediate discrete latent space.
In order to propagate gradients\nthough this discrete representation we introduce an improved semantic hashing technique.\nWe show that this technique performs well on a newly proposed quantitative efficiency measure.\nWe also analyze latent codes produced by the model showing how they correspond to\nwords and phrases. Finally, we present an application of the autoencoder-augmented\nmodel to generating diverse translations.", "keywords": "autoencoders;sequence models;discrete representations", "primary_area": "", "supplementary_material": "", "author": "Lukasz Kaiser;Samy Bengio", "authorids": "lukaszkaiser@google.com;bengio@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkaiser2018discrete,\ntitle={Discrete Autoencoders for Sequence Models},\nauthor={Lukasz Kaiser and Samy Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=BkQCGzZ0-},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BkQCGzZ0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkQCGzZ0-", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;1", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.720576692122892, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15984576751063995528&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Variational Continual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/199", "id": "BkQqq0gRb", "author_site": "Viet Cuong Nguyen, Yingzhen Li, Thang Bui, Richard E Turner", "tldr": "This paper develops a principled method for continual learning in deep models.", "abstract": "This paper develops variational continual learning (VCL), a simple but general framework for continual learning that fuses online variational inference (VI) and recent advances in Monte Carlo VI for neural networks. The framework can successfully train both deep discriminative models and deep generative models in complex continual learning settings where existing tasks evolve over time and entirely new tasks emerge. Experimental results show that VCL outperforms state-of-the-art continual learning methods on a variety of tasks, avoiding catastrophic forgetting in a fully automatic way.", "keywords": "continual learning;online variational inference", "primary_area": "", "supplementary_material": "", "author": "Cuong V. Nguyen;Yingzhen Li;Thang D. Bui;Richard E. Turner", "authorids": "vcn22@cam.ac.uk;yl494@cam.ac.uk;tdb40@cam.ac.uk;ret26@cam.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nv.2018variational,\ntitle={Variational Continual Learning},\nauthor={Cuong V. Nguyen and Yingzhen Li and Thang D. Bui and Richard E. 
Turner},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkQqq0gRb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=BkQqq0gRb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;3;4", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 904, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2605805270470223876&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BkQqq0gRb", "pdf": "https://openreview.net/pdf?id=BkQqq0gRb", "email": ";;;", "author_num": 4 }, { "id": "BkS3fnl0W", "title": "Semi-supervised Outlier Detection using Generative And Adversary Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "In a conventional binary/multi-class classification task, the decision boundary is supported by data from two or more classes. However, in a one-class classification task, only data from one class are available. To build a robust outlier detector using only data from a positive class, we propose a corrupted GAN (CorGAN), a deep convolutional Generative Adversary Network requiring no convergence during training. In the adversarial process of training CorGAN, the Generator is supposed to generate outlier samples for the negative class, and the Discriminator, as a one-class classifier, is trained to distinguish data from the training dataset (i.e. the positive class) and generated data from the Generator (i.e. the negative class). We also propose several techniques to improve the performance of the Discriminator (the one-class classifier).
The proposed model outperforms the traditional method PCA + PSVM and the solution based on Autoencoder.", "keywords": "Semi-supervised Learning;Generative And Adversary Framework;One-class classification;Outlier detection", "primary_area": "", "supplementary_material": "", "author": "Jindong Gu;Matthias Schubert;Volker Tresp", "authorids": "jindong.gu@siemens.com;schubert@dbs.ifi.lmu.de;volker.tresp@siemens.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngu2018semisupervised,\ntitle={Semi-supervised Outlier Detection using Generative And Adversary Framework},\nauthor={Jindong Gu and Matthias Schubert and Volker Tresp},\nyear={2018},\nurl={https://openreview.net/forum?id=BkS3fnl0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkS3fnl0W", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2700917923948554069&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "A New Method of Region Embedding for Text Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/325", "id": "BkSDMA36Z", "author_site": "Chao Qiao, Bo Huang, Guocheng Niu, daren li, daxiang dong, wei he, Dianhai Yu, hua wu", "tldr": "", "abstract": "Representing a text as a bag of properly identified \u201cphrases\u201d and using this representation to process the text has proven to be useful. The key question here is how to identify the phrases and represent them. The traditional method of utilizing n-grams can be regarded as an approximation of this approach. Such a method can suffer from data sparsity, however, particularly when the length of the n-gram is large. In this paper, we propose a new method of learning and utilizing task-specific distributed representations of n-grams, referred to as \u201cregion embeddings\u201d. Without loss of generality we address text classification. We specifically propose two models for region embeddings. In our models, the representation of a word has two parts: the embedding of the word itself, and a weighting matrix to interact with the local context, referred to as the local context unit. The region embeddings are learned and used in the classification task, as parameters of the neural network classifier. Experimental results show that our proposed method outperforms existing methods in text classification on several benchmark datasets.
The results also indicate that our method can indeed capture the salient phrasal expressions in the texts.", "keywords": "region embedding;local context unit;text classification", "primary_area": "", "supplementary_material": "", "author": "chao qiao;bo huang;guocheng niu;daren li;daxiang dong;wei he;dianhai yu;hua wu", "authorids": "chao.qiao@outlook.com;bohuang0321@gmail.com;niuguocheng@baidu.com;lidaren@baidu.com;dongdaxiang@baidu.com;hewei06@baidu.com;yudianhai@baidu.com;wu_hua@baidu.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nqiao2018a,\ntitle={A New Method of Region Embedding for Text Classification},\nauthor={chao qiao and bo huang and guocheng niu and daren li and daxiang dong and wei he and dianhai yu and hua wu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkSDMA36Z},\n}", "github": "[![github](/images/github_icon.svg) text-representation/local-context-unit](https://github.com/text-representation/local-context-unit)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 26, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4730426859617818868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BkSDMA36Z", "pdf": "https://openreview.net/pdf?id=BkSDMA36Z", "email": ";;;;;;;", "author_num": 8 }, { "id": "BkTQ8UckG", "title": "VSE++: Improving Visual-Semantic Embeddings with Hard Negatives", "track": "main", "status": "Withdraw", "tldr": "A new loss based on relatively hard negatives that achieves state-of-the-art performance in image-caption retrieval.", "abstract": "We present a new technique for learning visual-semantic embeddings for cross-modal retrieval. Inspired by the use of hard negatives in structured prediction, and ranking loss functions used in retrieval, we introduce a simple change to common loss functions used to learn multi-modal embeddings. That, combined with fine-tuning and the use of augmented data, yields significant gains in retrieval performance. We showcase our approach, dubbed VSE++, on the MS-COCO and Flickr30K datasets, using ablation studies and comparisons with existing methods. On MS-COCO our approach outperforms state-of-the-art methods by 8.8% in caption retrieval, and 11.3% in image retrieval (based on R@1).", "keywords": "Joint embeddings;Hard Negatives;Visual-semantic embeddings;Cross-modal retrieval;Ranking", "primary_area": "", "supplementary_material": "", "author": "Fartash Faghri;David J. 
Fleet;Jamie Ryan Kiros;Sanja Fidler", "authorids": "faghri@cs.toronto.edu;fleet@cs.toronto.edu;rkiros@cs.toronto.edu;fidler@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1", "site": "https://openreview.net/forum?id=BkTQ8UckG", "pdf_size": 0, "rating": "4", "confidence": "4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 1, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1560, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16677887724827446731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BkUDW_lCb", "title": "Pointing Out SQL Queries From Text", "track": "main", "status": "Reject", "tldr": "We present a type-based pointer network model together with a value-based loss method to effectively train a neural model to translate natural language to SQL.", "abstract": "The digitization of data has resulted in making datasets available to millions of users in the form of relational databases and spreadsheet tables. However, a majority of these users come from diverse backgrounds and lack the programming expertise to query and analyze such tables. We present a system that allows for querying data tables using natural language questions, where the system translates the question into an executable SQL query. We use a deep sequence-to-sequence model in which the decoder uses a simple type system of SQL expressions to structure the output prediction. Based on the type, the decoder either copies an output token from the input question using an attention-based copying mechanism or generates it from a fixed vocabulary. We also introduce a value-based loss function that transforms a distribution over locations to copy from into a distribution over the set of input tokens to improve training of our model.
We evaluate our model on the recently released WikiSQL dataset and show that our model trained using only supervised learning significantly outperforms the current state-of-the-art Seq2SQL model that uses reinforcement learning.", "keywords": "Program Synthesis;Semantic Parsing;WikiTable;SQL;Pointer Network", "primary_area": "", "supplementary_material": "", "author": "Chenglong Wang;Marc Brockschmidt;Rishabh Singh", "authorids": "clwang@cs.washington.edu;mabrocks@microsoft.com;risin@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2018pointing,\ntitle={Pointing Out {SQL} Queries From Text},\nauthor={Chenglong Wang and Marc Brockschmidt and Rishabh Singh},\nyear={2018},\nurl={https://openreview.net/forum?id=BkUDW_lCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkUDW_lCb", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10844650000179860663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Evaluating the Robustness of Neural Networks: An Extreme Value Theory Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/97", "id": "BkUHlMZ0b", "author_site": "Tsui-Wei Weng, Huan Zhang, Pin-Yu Chen, Jinfeng Yi, Dong Su, Yupeng Gao, Cho-Jui Hsieh, Luca Daniel", "tldr": "We propose the first attack-independent robustness metric, a.k.a CLEVER, that can be applied to any neural network classifier.", "abstract": "The robustness of neural networks to adversarial examples has received great attention due to security implications. Despite various attack approaches to crafting visually imperceptible adversarial examples, little has been developed towards a comprehensive measure of robustness. In this paper, we provide theoretical justification for converting robustness analysis into a local Lipschitz constant estimation problem, and propose to use the Extreme Value Theory for efficient evaluation. Our analysis yields a novel robustness metric called CLEVER, which is short for Cross Lipschitz Extreme Value for nEtwork Robustness. The proposed CLEVER score is attack-agnostic and is computationally feasible for large neural networks. Experimental results on various networks, including ResNet, Inception-v3 and MobileNet, show that (i) CLEVER is aligned with the robustness indication measured by the $\\ell_2$ and $\\ell_\\infty$ norms of adversarial examples from powerful attacks, and (ii) defended networks using defensive distillation or bounded ReLU indeed give better CLEVER scores. 
To the best of our knowledge, CLEVER is the first attack-independent robustness metric that can be applied to any neural network classifiers.\n\n", "keywords": "robustness;adversarial machine learning;neural network;extreme value theory;adversarial example;adversarial perturbation", "primary_area": "", "supplementary_material": "", "author": "Tsui-Wei Weng*;Huan Zhang*;Pin-Yu Chen;Jinfeng Yi;Dong Su;Yupeng Gao;Cho-Jui Hsieh;Luca Daniel", "authorids": "twweng@mit.edu;ecezhang@ucdavis.edu;pin-yu.chen@ibm.com;jinfengyi.ustc@gmail.com;dong.su@ibm.com;yupeng.gao@ibm.com;chohsieh@ucdavis.edu;dluca@mit.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nweng2018evaluating,\ntitle={Evaluating the Robustness of Neural Networks: An Extreme Value Theory Approach},\nauthor={Tsui-Wei Weng and Huan Zhang and Pin-Yu Chen and Jinfeng Yi and Dong Su and Yupeng Gao and Cho-Jui Hsieh and Luca Daniel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkUHlMZ0b},\n}", "github": "[![github](/images/github_icon.svg) huanzhang12/CLEVER](https://github.com/huanzhang12/CLEVER)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "1;3;3", "rating_avg": 7.0, "confidence_avg": 2.3333333333333335, "replies_avg": 13, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 585, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2078120094241692942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BkUHlMZ0b", "pdf": "https://openreview.net/pdf?id=BkUHlMZ0b", "email": ";;;;;;;", "author_num": 8 }, { "title": "Boosting the Actor with Dual Critic", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/23", "id": "BkUp6GZRW", "author_site": "Bo Dai, Albert Shaw, Niao He, Lihong Li, Le Song", "tldr": "We propose Dual Actor-Critic algorithm, which is derived in a principled way from the Lagrangian dual form of the Bellman optimality equation. The algorithm achieves the state-of-the-art performances across several benchmarks.", "abstract": "This paper proposes a new actor-critic-style algorithm called Dual Actor-Critic or Dual-AC. It is derived in a principled way from the Lagrangian dual form of the Bellman optimality equation, which can be viewed as a two-player game between the actor and a critic-like function, which is named as dual critic. Compared to its actor-critic relatives, Dual-AC has the desired property that the actor and dual critic are updated cooperatively to optimize the same objective function, providing a more transparent way for learning the critic that is directly related to the objective function of the actor. We then provide a concrete algorithm that can effectively solve the minimax optimization problem, using techniques of multi-step bootstrapping, path regularization, and stochastic dual ascent algorithm. 
We demonstrate that the proposed algorithm achieves the state-of-the-art performances across several benchmarks.", "keywords": "reinforcement learning;actor-critic algorithm;Lagrangian duality", "primary_area": "", "supplementary_material": "", "author": "Bo Dai;Albert Shaw;Niao He;Lihong Li;Le Song", "authorids": "bohr.dai@gmail.com;ashaw596@gatech.edu;niaohe@illinois.edu;lihongli.cs@gmail.com;lsong@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndai2018boosting,\ntitle={Boosting the Actor with Dual Critic},\nauthor={Bo Dai and Albert Shaw and Niao He and Lihong Li and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkUp6GZRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7529217791241112867&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BkUp6GZRW", "pdf": "https://openreview.net/pdf?id=BkUp6GZRW", "email": ";;;;", "author_num": 5 }, { "id": "BkVf1AeAZ", "title": "Label Embedding Network: Learning Label Representation for Soft Training of Deep Networks", "track": "main", "status": "Reject", "tldr": "Learning Label Representation for Deep Networks", "abstract": "We propose a method, called Label Embedding Network, which can learn label representation (label embedding) during the training process of deep networks. With the proposed method, the label embedding is adaptively and automatically learned through back propagation. The original one-hot represented loss function is converted into a new loss function with soft distributions, such that the originally unrelated labels have continuous interactions with each other during the training process. As a result, the trained model can achieve substantially higher accuracy and with faster convergence speed. Experimental results based on competitive tasks demonstrate the effectiveness of the proposed method, and the learned label embedding is reasonable and interpretable. 
The proposed method achieves comparable or even better results than the state-of-the-art systems.", "keywords": "label embedding;deep learning;label representation;computer vision;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Xu Sun;Bingzhen Wei;Xuancheng Ren;Shuming Ma", "authorids": "xusun@pku.edu.cn;weibz@pku.edu.cn;renxc@pku.edu.cn;shumingma@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsun2018label,\ntitle={Label Embedding Network: Learning Label Representation for Soft Training of Deep Networks},\nauthor={Xu Sun and Bingzhen Wei and Xuancheng Ren and Shuming Ma},\nyear={2018},\nurl={https://openreview.net/forum?id=BkVf1AeAZ},\n}", "github": "[![github](/images/github_icon.svg) lancopku/LabelEmb](https://github.com/lancopku/LabelEmb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkVf1AeAZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3798475280646449275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BkVsWbbAW", "title": "Deep Generative Dual Memory Network for Continual Learning", "track": "main", "status": "Reject", "tldr": "A dual memory architecture inspired from human brain to learn sequentially incoming tasks, while averting catastrophic forgetting.", "abstract": "Despite advances in deep learning, artificial neural networks do not learn the same way as humans do. Today, neural networks can learn multiple tasks when trained on them jointly, but cannot maintain performance on learnt tasks when tasks are presented one at a time -- this phenomenon called catastrophic forgetting is a fundamental challenge to overcome before neural networks can learn continually from incoming data. In this work, we derive inspiration from human memory to develop an architecture capable of learning continuously from sequentially incoming tasks, while averting catastrophic forgetting. Specifically, our model consists of a dual memory architecture to emulate the complementary learning systems (hippocampus and the neocortex) in the human brain and maintains a consolidated long-term memory via generative replay of past experiences. We (i) substantiate our claim that replay should be generative, (ii) show the benefits of generative replay and dual memory via experiments, and (iii) demonstrate improved performance retention even for small models with low capacity. 
Our architecture displays many important characteristics of the human memory and provides insights on the connection between sleep and learning in humans.", "keywords": "Continual Learning;Catastrophic Forgetting;Sequential Multitask Learning;Deep Generative Models;Dual Memory Networks;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Nitin Kamra;Umang Gupta;Yan Liu", "authorids": "nkamra@usc.edu;umanggup@usc.edu;yanliu.cs@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkamra2018deep,\ntitle={Deep Generative Dual Memory Network for Continual Learning},\nauthor={Nitin Kamra and Umang Gupta and Yan Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=BkVsWbbAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkVsWbbAW", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;2", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=382690068125922155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Beyond Shared Hierarchies: Deep Multitask Learning through Soft Layer Ordering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/49", "id": "BkXmYfbAZ", "author_site": "Elliot Meyerson, Risto Miikkulainen", "tldr": "Relaxing the constraint of shared hierarchies enables more effective deep multitask learning.", "abstract": "Existing deep multitask learning (MTL) approaches align layers shared between tasks in a parallel ordering. Such an organization significantly constricts the types of shared structure that can be learned. The necessity of parallel ordering for deep MTL is first tested by comparing it with permuted ordering of shared layers. The results indicate that a flexible ordering can enable more effective sharing, thus motivating the development of a soft ordering approach, which learns how shared layers are applied in different ways for different tasks. Deep MTL with soft ordering outperforms parallel ordering methods across a series of domains. 
These results suggest that the power of deep MTL comes from learning highly general building blocks that can be assembled to meet the demands of each task.", "keywords": "multitask learning;deep learning;modularity", "primary_area": "", "supplementary_material": "", "author": "Elliot Meyerson;Risto Miikkulainen", "authorids": "ekm@cs.utexas.edu;risto@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmeyerson2018beyond,\ntitle={Beyond Shared Hierarchies: Deep Multitask Learning through Soft Layer Ordering},\nauthor={Elliot Meyerson and Risto Miikkulainen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkXmYfbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8511828577202972238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkXmYfbAZ", "pdf": "https://openreview.net/pdf?id=BkXmYfbAZ", "email": ";", "author_num": 2 }, { "id": "Bk_fs6gA-", "title": "Long Term Memory Network for Combinatorial Optimization Problems", "track": "main", "status": "Reject", "tldr": "We propose a memory network model to solve Binary LP instances where the memory information is perseved for long-term use. ", "abstract": "This paper introduces a framework for solving combinatorial optimization problems by learning from input-output examples of optimization problems. We introduce a new memory augmented neural model in which the memory is not resettable (i.e the information stored in the memory after processing an input example is kept for the next seen examples). We used deep reinforcement learning to train a memory controller agent to store useful memories. Our model was able to outperform hand-crafted solver on Binary Linear Programming (Binary LP). The proposed model is tested on different Binary LP instances with large number of variables (up to 1000 variables) and constrains (up to 700 constrains).", "keywords": "Memory Networks;Combinatorial Optimization;Binary LP", "primary_area": "", "supplementary_material": "", "author": "Hazem A. A. Nomer;Abdallah Aboutahoun;Ashraf Elsayed", "authorids": "hazemahmed@alexu.edu.eg;abdallah_aboutahoun@alexu.edu.eg;ashraf.elsayed@alexu.edu.eg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\na.2018long,\ntitle={Long Term Memory Network for Combinatorial Optimization Problems},\nauthor={Hazem A. A. 
Nomer and Abdallah Aboutahoun and Ashraf Elsayed},\nyear={2018},\nurl={https://openreview.net/forum?id=Bk_fs6gA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk_fs6gA-", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;2;1", "rating_avg": 3.6666666666666665, "confidence_avg": 2.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.9449111825230679, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11524853501305749575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Consequentialist conditional cooperation in social dilemmas with imperfect information", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/335", "id": "BkabRiQpb", "author_site": "Alex Peysakhovich, Adam Lerer", "tldr": "We show how to use deep RL to construct agents that can solve social dilemmas beyond matrix games.", "abstract": "Social dilemmas, where mutual cooperation can lead to high payoffs but participants face incentives to cheat, are ubiquitous in multi-agent interaction. We wish to construct agents that cooperate with pure cooperators, avoid exploitation by pure defectors, and incentivize cooperation from the rest. However, often the actions taken by a partner are (partially) unobserved or the consequences of individual actions are hard to predict. We show that in a large class of games good strategies can be constructed by conditioning one's behavior solely on outcomes (ie. one's past rewards). We call this consequentialist conditional cooperation. We show how to construct such strategies using deep reinforcement learning techniques and demonstrate, both analytically and experimentally, that they are effective in social dilemmas beyond simple matrix games. 
We also show the limitations of relying purely on consequences and discuss the need for understanding both the consequences of and the intentions behind an action.", "keywords": "deep reinforcement learning;cooperation;social dilemma;multi-agent systems", "primary_area": "", "supplementary_material": "", "author": "Alexander Peysakhovich;Adam Lerer", "authorids": "alexpeys@gmail.com;alerer@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npeysakhovich2018consequentialist,\ntitle={Consequentialist conditional cooperation in social dilemmas with imperfect information},\nauthor={Alexander Peysakhovich and Adam Lerer},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkabRiQpb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5526042842036893933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkabRiQpb", "pdf": "https://openreview.net/pdf?id=BkabRiQpb", "email": ";", "author_num": 2 }, { "id": "BkbOsNeSM", "title": "FastNorm: Improving Numerical Stability of Deep Network Training with Efficient Normalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a modification to weight normalization techniques that provides the same convergence benefits but requires fewer computational operations. The proposed method, FastNorm, exploits the low-rank properties of weight updates and infers the norms without explicitly calculating them, replacing an $O(n^2)$ computation with an $O(n)$ one for a fully-connected layer. It improves numerical stability and reduces accuracy variance enabling higher learning rate and offering better convergence. We report experimental results that illustrate the advantage of the proposed method. 
", "keywords": "Neural networks;Training;Convergence", "primary_area": "", "supplementary_material": "", "author": "Sadhika Malladi;Ilya Sharapov", "authorids": "sadhika@mit.edu;ilya@cerebras.net", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkbOsNeSM", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;3;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10093065733891938510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkeC_J-R-", "title": "Combination of Supervised and Reinforcement Learning For Vision-Based Autonomous Control", "track": "main", "status": "Reject", "tldr": "The new combination of reinforcement and supervised learning, dramatically decreasing the number of required samples for training on video", "abstract": " Reinforcement learning methods have recently achieved impressive results on a wide range of control problems. However, especially with complex inputs, they still require an extensive amount of training data in order to converge to a meaningful solution. This limitation largely prohibits their usage for complex input spaces such as video signals, and it is still impossible to use it for a number of complex problems in a real world environments, including many of those for video based control. Supervised learning, on the contrary, is capable of learning on a relatively small number of samples, however it does not take into account reward-based control policies and is not capable to provide independent control policies. In this article we propose a model-free control method, which uses a combination of reinforcement and supervised learning for autonomous control and paves the way towards policy based control in real world environments. We use SpeedDreams/TORCS video game to demonstrate that our approach requires much less samples (hundreds of thousands against millions or tens of millions) comparing to the state-of-the-art reinforcement learning techniques on similar data, and at the same time overcomes both supervised and reinforcement learning approaches in terms of quality. Additionally, we demonstrate the applicability of the method to MuJoCo control problems. 
", "keywords": "Reinforcement learning;deep learning;autonomous control", "primary_area": "", "supplementary_material": "", "author": "Dmitry Kangin;Nicolas Pugeault", "authorids": "d.kangin@exeter.ac.uk;n.pugeault@exeter.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkangin2018combination,\ntitle={Combination of Supervised and Reinforcement Learning For Vision-Based Autonomous Control},\nauthor={Dmitry Kangin and Nicolas Pugeault},\nyear={2018},\nurl={https://openreview.net/forum?id=BkeC_J-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkeC_J-R-", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;3", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4120728540957570641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Unsupervised Cipher Cracking Using Discrete GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/277", "id": "BkeqO7x0-", "author_site": "Aidan Gomez, Sicong(Sheldon) Huang, Ivan Zhang, Bryan Li, Muhammad Osama, Lukasz Kaiser", "tldr": "", "abstract": "This work details CipherGAN, an architecture inspired by CycleGAN used for inferring the underlying cipher mapping given banks of unpaired ciphertext and plaintext. We demonstrate that CipherGAN is capable of cracking language data enciphered using shift and Vigenere ciphers to a high degree of fidelity and for vocabularies much larger than previously achieved. We present how CycleGAN can be made compatible with discrete data and train in a stable way. We then prove that the technique used in CipherGAN avoids the common problem of uninformative discrimination associated with GANs applied to discrete data.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Aidan N. Gomez;Sicong Huang;Ivan Zhang;Bryan M. Li;Muhammad Osama;Lukasz Kaiser", "authorids": "aidan.n.gomez@gmail.com;huang@cs.toronto.edu;ivan@for.ai;bryan@for.ai;osama@for.ai;lukaszkaiser@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nn.2018unsupervised,\ntitle={Unsupervised Cipher Cracking Using Discrete {GAN}s},\nauthor={Aidan N. Gomez and Sicong Huang and Ivan Zhang and Bryan M. 
Li and Muhammad Osama and Lukasz Kaiser},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkeqO7x0-},\n}", "github": "[![github](/images/github_icon.svg) for-ai/ciphergan](https://github.com/for-ai/ciphergan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "1;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3064134608179971225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BkeqO7x0-", "pdf": "https://openreview.net/pdf?id=BkeqO7x0-", "email": ";;;;;", "author_num": 6 }, { "id": "BkfEzz-0-", "title": "Neuron as an Agent", "track": "main", "status": "Workshop", "tldr": "Neuron as an Agent (NaaA) enable us to train multi-agent communication without a trusted third party.", "abstract": "Existing multi-agent reinforcement learning (MARL) communication methods have relied on a trusted third party (TTP) to distribute reward to agents, leaving them inapplicable in peer-to-peer environments. This paper proposes reward distribution using {\\em Neuron as an Agent} (NaaA) in MARL without a TTP with two key ideas: (i) inter-agent reward distribution and (ii) auction theory. Auction theory is introduced because inter-agent reward distribution is insufficient for optimization. Agents in NaaA maximize their profits (the difference between reward and cost) and, as a theoretical result, the auction mechanism is shown to have agents autonomously evaluate counterfactual returns as the values of other agents. NaaA enables representation trades in peer-to-peer environments, ultimately regarding unit in neural networks as agents. 
Finally, numerical experiments (a single-agent environment from OpenAI Gym and a multi-agent environment from ViZDoom) confirm that NaaA framework optimization leads to better performance in reinforcement learning.", "keywords": "Multi-agent Reinforcement Learning;Communication;Reward Distribution;Trusted Third Party;Auction Theory", "primary_area": "", "supplementary_material": "", "author": "Shohei Ohsawa;Kei Akuzawa;Tatsuya Matsushima;Gustavo Bezerra;Yusuke Iwasawa;Hiroshi Kajino;Seiya Takenaka;Yutaka Matsuo", "authorids": "ohsawa@weblab.t.u-tokyo.ac.jp;akuzawa-kei@weblab.t.u-tokyo.ac.jp;matsushima@weblab.t.u-tokyo.ac.jp;gustavo@weblab.t.u-tokyo.ac.jp;iwasawa@weblab.t.u-tokyo.ac.jp;kjn@jp.ibm.com;s.takenaka@aediworks.com;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nohsawa2018neuron,\ntitle={Neuron as an Agent},\nauthor={Shohei Ohsawa and Kei Akuzawa and Tatsuya Matsushima and Gustavo Bezerra and Yusuke Iwasawa and Hiroshi Kajino and Seiya Takenaka and Yutaka Matsuo},\nyear={2018},\nurl={https://openreview.net/forum?id=BkfEzz-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BkfEzz-0-", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 8, "corr_rating_confidence": -0.9607689228305228, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16334435829222554033&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Bki1Ct1AW", "title": "Baseline-corrected space-by-time non-negative matrix factorization for decoding single trial population spike trains", "track": "main", "status": "Reject", "tldr": "We extended single-trial space-by-time tensor decomposition based on non-negative matrix factorization to efficiently discount pre-stimulus baseline activity that improves decoding performance on data with non-negligible baselines.", "abstract": "Activity of populations of sensory neurons carries stimulus information in both the temporal and the spatial dimensions. This poses the question of how to compactly represent all the information that the population codes carry across all these dimensions. Here, we developed an analytical method to factorize a large number of retinal ganglion cells' spike trains into a robust low-dimensional representation that captures efficiently both their spatial and temporal information. In particular, we extended previously used single-trial space-by-time tensor decomposition based on non-negative matrix factorization to efficiently discount pre-stimulus baseline activity. On data recorded from retinal ganglion cells with strong pre-stimulus baseline, we showed that in situations where the stimulus elicits a strong change in firing rate, our extensions yield a boost in stimulus decoding performance. 
Our results thus suggest that taking into account the baseline can be important for finding a compact information-rich representation of neural activity.", "keywords": "Space-by-time non-negative matrix factorization;dimensionality reduction;baseline correction;neuronal decoding;mutual information", "primary_area": "", "supplementary_material": "", "author": "Arezoo Alizadeh;Marion Mutter;Thomas M\u00fcnch;Arno Onken;Stefano Panzeri", "authorids": "arezoo.alizadehkhajehiem@iit.it;marion.mutter@gmx.de;thomas.muench@cin.uni-tuebingen.de;aonken@inf.ed.ac.uk;stefano.panzeri@iit.it", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nalizadeh2018baselinecorrected,\ntitle={Baseline-corrected space-by-time non-negative matrix factorization for decoding single trial population spike trains},\nauthor={Arezoo Alizadeh and Marion Mutter and Thomas M\u00fcnch and Arno Onken and Stefano Panzeri},\nyear={2018},\nurl={https://openreview.net/forum?id=Bki1Ct1AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Bki1Ct1AW", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hEHd6YQJwHEJ:scholar.google.com/&scioq=Baseline-corrected+space-by-time+non-negative+matrix+factorization+for+decoding+single+trial+population+spike+trains&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Bki4EfWCb", "title": "Inference Suboptimality in Variational Autoencoders", "track": "main", "status": "Workshop", "tldr": "We decompose the gap between the marginal log-likelihood and the evidence lower bound and study the effect of the approximate posterior on the true posterior distribution in VAEs.", "abstract": "Amortized inference has led to efficient approximate inference for large datasets. The quality of posterior inference is largely determined by two factors: a) the ability of the variational distribution to model the true posterior and b) the capacity of the recognition network to generalize inference over all datapoints. We analyze approximate inference in variational autoencoders in terms of these factors. We find that suboptimal inference is often due to amortizing inference rather than the limited complexity of the approximating distribution. We show that this is due partly to the generator learning to accommodate the choice of approximation. 
Furthermore, we show that the parameters used to increase the expressiveness of the approximation play a role in generalizing inference rather than simply improving the complexity of the approximation.", "keywords": "Approximate Inference;Amortization;Posterior Approximations;Variational Autoencoder", "primary_area": "", "supplementary_material": "", "author": "Chris Cremer;Xuechen Li;David Duvenaud", "authorids": "ccremer@cs.toronto.edu;lxuechen@cs.toronto.edu;duvenaud@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncremer2018inference,\ntitle={Inference Suboptimality in Variational Autoencoders},\nauthor={Chris Cremer and Xuechen Li and David Duvenaud},\nyear={2018},\nurl={https://openreview.net/forum?id=Bki4EfWCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bki4EfWCb", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;5;5", "rating_avg": 6.0, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 349, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16046280884129751666&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BkiIkBJ0b", "title": "Do Deep Reinforcement Learning Algorithms really Learn to Navigate?", "track": "main", "status": "Reject", "tldr": "We quantitatively and qualitatively evaluate deep reinforcement learning based navigation methods under a variety of conditions to answer the question of how close they are to replacing classical path planners and mapping algorithms.", "abstract": "Deep reinforcement learning (DRL) algorithms have demonstrated progress in learning to find a goal in challenging environments. As the title of the paper by Mirowski et al. (2016) suggests, one might assume that DRL-based algorithms are able to \u201clearn to navigate\u201d and are thus ready to replace classical mapping and path-planning algorithms, at least in simulated environments. Yet, from experiments and analysis in this earlier work, it is not clear what strategies are used by these algorithms in navigating the mazes and finding the goal. In this paper, we pose and study this underlying question: are DRL algorithms doing some form of mapping and/or path-planning? Our experiments show that the algorithms are not memorizing the maps of mazes at the testing stage but, rather, at the training stage. Hence, the DRL algorithms fall short of qualifying as mapping or path-planning algorithms with any reasonable definition of mapping. We extend the experiments in Mirowski et al. (2016) by separating the set of training and testing maps and by a more ablative coverage of the space of experiments. Our systematic experiments show that the NavA3C-D1-D2-L algorithm, when trained and tested on the same maps, is able to choose the shorter paths to the goal. However, when tested on unseen maps the algorithm utilizes a wall-following strategy to find the goal without doing any mapping or path planning.", "keywords": "deep reinforcement learning;navigation;path-planning;mapping", "primary_area": "", "supplementary_material": "", "author": "Shurjo Banerjee;Vikas Dhiman;Brent Griffin;Jason J. 
Corso", "authorids": "shurjo@umich.edu;dhiman@umich.edu;griffb@umich.edu;jjcorso@umich.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbanerjee2018do,\ntitle={Do Deep Reinforcement Learning Algorithms really Learn to Navigate?},\nauthor={Shurjo Banerjee and Vikas Dhiman and Brent Griffin and Jason J. Corso},\nyear={2018},\nurl={https://openreview.net/forum?id=BkiIkBJ0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkiIkBJ0b", "pdf_size": 0, "rating": "3;3;7", "confidence": "5;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7352421671492660232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Zero-Shot Visual Imitation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/51", "id": "BkisuzWRW", "author_site": "Deepak Pathak, Parsa Mahmoudieh, Guanghao Luo, Pulkit Agrawal, Dian Chen, Fred Shentu, Evan Shelhamer, Jitendra Malik, Alexei Efros, Trevor Darrell", "tldr": "Agents can learn to imitate solely visual demonstrations (without actions) at test time after learning from their own experience without any form of supervision at training time.", "abstract": "The current dominant paradigm for imitation learning relies on strong supervision of expert actions to learn both 'what' and 'how' to imitate. We pursue an alternative paradigm wherein an agent first explores the world without any expert supervision and then distills its experience into a goal-conditioned skill policy with a novel forward consistency loss. In our framework, the role of the expert is only to communicate the goals (i.e., what to imitate) during inference. The learned policy is then employed to mimic the expert (i.e., how to imitate) after seeing just a sequence of images demonstrating the desired task. Our method is 'zero-shot' in the sense that the agent never has access to expert actions during training or for the task demonstration at inference. We evaluate our zero-shot imitator in two real-world settings: complex rope manipulation with a Baxter robot and navigation in previously unseen office environments with a TurtleBot. Through further experiments in VizDoom simulation, we provide evidence that better mechanisms for exploration lead to learning a more capable policy which in turn improves end task performance. Videos, models, and more details are available at https://pathak22.github.io/zeroshot-imitation/.", "keywords": "imitation;zero-shot;self-supervised;robotics;skills;navigation;manipulation;vizdoom;reinforcement", "primary_area": "", "supplementary_material": "", "author": "Deepak Pathak;Parsa Mahmoudieh;Guanghao Luo;Pulkit Agrawal;Dian Chen;Yide Shentu;Evan Shelhamer;Jitendra Malik;Alexei A. 
Efros;Trevor Darrell", "authorids": "pathak@berkeley.edu;parsa.m@berkeley.edu;michaelluo@berkeley.edu;pulkitag@berkeley.edu;dianchen@berkeley.edu;fredshentu@berkeley.edu;shelhamer@cs.berkeley.edu;malik@eecs.berkeley.edu;efros@eecs.berkeley.edu;trevor@eecs.berkeley.edu", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\npathak*2018zeroshot,\ntitle={Zero-Shot Visual Imitation},\nauthor={Deepak Pathak* and Parsa Mahmoudieh* and Michael Luo* and Pulkit Agrawal* and Dian Chen and Fred Shentu and Evan Shelhamer and Jitendra Malik and Alexei A. Efros and Trevor Darrell},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkisuzWRW},\n}", "github": "[![github](/images/github_icon.svg) pathak22/zeroshot-imitation](https://github.com/pathak22/zeroshot-imitation)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "5;3;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 10, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 339, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15276541363750863723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=BkisuzWRW", "pdf": "https://openreview.net/pdf?id=BkisuzWRW", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "BkktYCkZf", "title": "Per-Weight Class-Based Learning Rates via Analytical Continuation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the problem of training deep fully connected neural networks. Despite much progress in the design of activation functions, novel normalization techniques, and various skip-connection techniques, such networks remain challenging to train due to vanishing or exploding gradients. Our method is based on employing a different class-dependent learning rate to each network weight. Since the learning rates are hyperparameters and not part of the network, we perform an analytical continuation of the network, and create a generalized network. Following this reparameterization, the set of per-class per-weight learning rates are being manipulated during the training iterations. 
Our results show that the new algorithm leads to improved classification accuracy for both classical and modern activation functions.", "keywords": "adaptive learning rates;analytical continuation;fully connected networks", "primary_area": "", "supplementary_material": "", "author": "Michael Rotman;Lior Wolf", "authorids": "migo007@gmail.com;wolf@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkktYCkZf", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;2", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 3, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C_V0gMvccvwJ:scholar.google.com/&scioq=Per-Weight+Class-Based+Learning+Rates+via+Analytical+Continuation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Bkl1uWb0Z", "title": "Inducing Grammars with and for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "improve NMT with latent trees", "abstract": "Previous work has demonstrated the benefits of incorporating additional linguistic annotations such as syntactic trees into neural machine translation. However the cost of obtaining those syntactic annotations is expensive for many languages and the quality of unsupervised learning linguistic structures is too poor to be helpful. In this work, we aim to improve neural machine translation via source side dependency syntax but without explicit annotation. We propose a set of models that learn to induce dependency trees on the source side and learn to use that information on the target side. Importantly, we also show that our dependency trees capture important syntactic features of language and improve translation quality on two language pairs En-De and En-Ru.", "keywords": "structured attention;neural machine translation;grammar induction", "primary_area": "", "supplementary_material": "", "author": "Ke Tran;Yonatan Bisk", "authorids": "ketranmanh@gmail.com;ybisk@yonatanbisk.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntran2018inducing,\ntitle={Inducing Grammars with and for Neural Machine Translation},\nauthor={Ke Tran and Yonatan Bisk},\nyear={2018},\nurl={https://openreview.net/forum?id=Bkl1uWb0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Bkl1uWb0Z", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8817242039040701511&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "BkoCeqgR-", "title": "On the Construction and Evaluation of Color Invariant Networks", "track": "main", "status": "Reject", "tldr": "We construct and evaluate color invariant neural nets on a novel realistic data set", "abstract": "This is an empirical paper which constructs color invariant networks and evaluates their performances on a realistic data set. 
The paper studies the simplest possible case of color invariance: invariance under pixel-wise permutation of the color channels. Thus the network is aware not of the specific color object, but of its colorfulness. The data set introduced in the paper consists of images showing crashed cars from which ten classes were extracted. An additional annotation was done which labeled whether the car shown was red or non-red. The networks were evaluated by their performance on the classification task. With the color annotation we altered the color ratios in the training data and analyzed the generalization capabilities of the networks on the unaltered test data. We further split the test data into red and non-red cars and did a similar evaluation. It is shown in the paper that a pixel-wise ordering of the rgb-values of the images performs better or at least similarly for small deviations from the true color ratios. The limits of these networks are also discussed.", "keywords": "deep learning;invariance;data set;evaluation", "primary_area": "", "supplementary_material": "", "author": "Konrad Groh", "authorids": "konrad.groh@de.bosch.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ngroh2018on,\ntitle={On the Construction and Evaluation of Color Invariant Networks},\nauthor={Konrad Groh},\nyear={2018},\nurl={https://openreview.net/forum?id=BkoCeqgR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkoCeqgR-", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YtvnoEVGYaIJ:scholar.google.com/&scioq=On+the+Construction+and+Evaluation+of+Color+Invariant+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BkoXnkWAb", "title": "Shifting Mean Activation Towards Zero with Bipolar Activation Functions", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We propose a simple extension to the ReLU-family of activation functions that allows them to shift the mean activation across a layer towards zero. Combined with proper weight initialization, this alleviates the need for normalization layers. We explore the training of deep vanilla recurrent neural networks (RNNs) with up to 144 layers, and show that bipolar activation functions help learning in this setting. On the Penn Treebank and Text8 language modeling tasks we obtain competitive results, improving on the best reported results for non-gated networks. 
In experiments with convolutional neural networks without batch normalization, we find that bipolar activations produce a faster drop in training error, and results in a lower test error on the CIFAR-10 classification task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lars Hiller Eidnes;Arild N\u00f8kland", "authorids": "larseidnes@gmail.com;arild.nokland@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhiller2018shifting,\ntitle={Shifting Mean Activation Towards Zero with Bipolar Activation Functions},\nauthor={Lars Hiller Eidnes and Arild N\u00f8kland},\nyear={2018},\nurl={https://openreview.net/forum?id=BkoXnkWAb},\n}", "github": "[![github](/images/github_icon.svg) larspars/word-rnn](https://github.com/larspars/word-rnn)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkoXnkWAb", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4922137843063373202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BkpXqwUTZ", "title": "Iterative temporal differencing with fixed random feedback alignment support spike-time dependent plasticity in vanilla backpropagation for deep learning", "track": "main", "status": "Reject", "tldr": "Iterative temporal differencing with fixed random feedback alignment support spike-time dependent plasticity in vanilla backpropagation for deep learning.", "abstract": "In vanilla backpropagation (VBP), activation function matters considerably in terms of non-linearity and differentiability.\nVanishing gradient has been an important problem related to the bad choice of activation function in deep learning (DL).\nThis work shows that a differentiable activation function is not necessary any more for error backpropagation. 
\nThe derivative of the activation function can be replaced by an iterative temporal differencing (ITD) using fixed random feedback weight alignment (FBA).\nUsing FBA with ITD, we can transform the VBP into a more biologically plausible approach for learning deep neural network architectures.\nWe don't claim that ITD works completely the same as the spike-time dependent plasticity (STDP) in our brain but this work can be a step toward the integration of STDP-based error backpropagation in deep learning.", "keywords": "Iterative temporal differencing;feedback alignment;spike-time dependent plasticity;vanilla backpropagation;deep learning", "primary_area": "", "supplementary_material": "", "author": "Aras Dargazany;Kunal Mankodiya", "authorids": "arasdar@uri.edu;arasdar@uri.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndargazany2018iterative,\ntitle={Iterative temporal differencing with fixed random feedback alignment support spike-time dependent plasticity in vanilla backpropagation for deep learning},\nauthor={Aras Dargazany and Kunal Mankodiya},\nyear={2018},\nurl={https://openreview.net/forum?id=BkpXqwUTZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkpXqwUTZ", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;5;4", "rating_avg": 2.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e2QGVAu79agJ:scholar.google.com/&scioq=Iterative+temporal+differencing+with+fixed+random+feedback+alignment+support+spike-time+dependent+plasticity+in+vanilla+backpropagation+for+deep+learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Decision Boundary Analysis of Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/57", "id": "BkpiPMbA-", "author_site": "Warren He, Bo Li, Dawn Song", "tldr": "Looking at decision boundaries around an input gives you more information than a fixed small neighborhood", "abstract": "Deep neural networks (DNNs) are vulnerable to adversarial examples, which are carefully crafted instances aiming to cause prediction errors for DNNs. Recent research on adversarial examples has examined local neighborhoods in the input space of DNN models. However, previous work has limited what regions to consider, focusing either on low-dimensional subspaces or small balls. In this paper, we argue that information from larger neighborhoods, such as from more directions and from greater distances, will better characterize the relationship between adversarial examples and the DNN models. First, we introduce an attack, OPTMARGIN, which generates adversarial examples robust to small perturbations. These examples successfully evade a defense that only considers a small ball around an input instance. Second, we analyze a larger neighborhood around input instances by looking at properties of surrounding decision boundaries, namely the distances to the boundaries and the adjacent classes. We find that the boundaries around these adversarial examples do not resemble the boundaries around benign examples. 
Finally, we show that, under scrutiny of the surrounding decision boundaries, our OPTMARGIN examples do not convincingly mimic benign examples. Although our experiments are limited to a few specific attacks, we hope these findings will motivate new, more evasive attacks and ultimately, effective defenses.", "keywords": "adversarial machine learning;supervised representation learning;decision regions;decision boundaries", "primary_area": "", "supplementary_material": "", "author": "Warren He;Bo Li;Dawn Song", "authorids": "_w@eecs.berkeley.edu;lxbosky@gmail.com;dawnsong.travel@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhe2018decision,\ntitle={Decision Boundary Analysis of Adversarial Examples},\nauthor={Warren He and Bo Li and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkpiPMbA-},\n}", "github": "[![github](/images/github_icon.svg) sunblaze-ucb/decision-boundaries](https://github.com/sunblaze-ucb/decision-boundaries)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;2", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14822232947259136601&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=BkpiPMbA-", "pdf": "https://openreview.net/pdf?id=BkpiPMbA-", "email": ";;", "author_num": 3 }, { "title": "Loss-aware Weight Quantization of Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/203", "id": "BkrSv0lA-", "author_site": "LU HOU, James Kwok", "tldr": "A loss-aware weight quantization algorithm that directly considers its effect on the loss is proposed.", "abstract": "The huge size of deep networks hinders their use in small computing devices. In this paper, we consider compressing the network by weight quantization. We extend a recently proposed loss-aware weight binarization scheme to ternarization, with possibly different scaling parameters for the positive and negative weights, and m-bit (where m > 2) quantization. Experiments on feedforward and recurrent neural networks show that the proposed scheme outperforms state-of-the-art weight quantization algorithms, and is as accurate (or even more accurate) than the full-precision network.", "keywords": "deep learning;network quantization", "primary_area": "", "supplementary_material": "", "author": "Lu Hou;James T. Kwok", "authorids": "lhouab@cse.ust.hk;jamesk@cse.ust.hk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhou2018lossaware,\ntitle={Loss-aware Weight Quantization of Deep Networks},\nauthor={Lu Hou and James T. 
Kwok},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkrSv0lA-},\n}", "github": "[![github](/images/github_icon.svg) houlu369/Loss-aware-weight-quantization](https://github.com/houlu369/Loss-aware-weight-quantization)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17603219917891692242&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=BkrSv0lA-", "pdf": "https://openreview.net/pdf?id=BkrSv0lA-", "email": ";", "author_num": 2 }, { "title": "Online Learning Rate Adaptation with Hypergradient Descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/14", "id": "BkrsAzWAb", "author_site": "Atilim Gunes Baydin, Robert Cornish, David Mart\u00ednez, Mark Schmidt, Frank Wood", "tldr": "", "abstract": "We introduce a general method for improving the convergence rate of gradient-based optimizers that is easy to implement and works well in practice. We demonstrate the effectiveness of the method in a range of optimization problems by applying it to stochastic gradient descent, stochastic gradient descent with Nesterov momentum, and Adam, showing that it significantly reduces the need for the manual tuning of the initial learning rate for these commonly used algorithms. Our method works by dynamically updating the learning rate during optimization using the gradient with respect to the learning rate of the update rule itself. 
Computing this \"hypergradient\" needs little additional computation, requires only one extra copy of the original gradient to be stored in memory, and relies upon nothing more than what is provided by reverse-mode automatic differentiation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Atilim Gunes Baydin;Robert Cornish;David Martinez Rubio;Mark Schmidt;Frank Wood", "authorids": "gunes@robots.ox.ac.uk;rcornish@robots.ox.ac.uk;david.martinez2@wadh.ox.ac.uk;schmidtm@cs.ubc.ca;fwood@robots.ox.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngunes2018online,\ntitle={Online Learning Rate Adaptation with Hypergradient Descent},\nauthor={Atilim Gunes Baydin and Robert Cornish and David Martinez Rubio and Mark Schmidt and Frank Wood},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkrsAzWAb},\n}", "github": "[![github](/images/github_icon.svg) gbaydin/hypergradient-descent](https://github.com/gbaydin/hypergradient-descent) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BkrsAzWAb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 301, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2792585694661059835&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=BkrsAzWAb", "pdf": "https://openreview.net/pdf?id=BkrsAzWAb", "email": ";;;;", "author_num": 5 }, { "id": "Bkv76ilDz", "title": "Discrete Wasserstein Generative Adversarial Networks (DWGAN)", "track": "main", "status": "Withdraw", "tldr": "We propose a Discrete Wasserstein GAN (DWGAN) model which is based on a dual formulation of the Wasserstein distance between two discrete distributions.", "abstract": "Generating complex discrete distributions remains as one of the challenging problems in machine learning. Existing techniques for generating complex distributions with high degrees of freedom depend on standard generative models like Generative Adversarial Networks (GAN), Wasserstein GAN, and associated variations. Such models are based on an optimization involving the distance between two continuous distributions. We introduce a Discrete Wasserstein GAN (DWGAN) model which is based on a dual formulation of the Wasserstein distance between two discrete distributions. We derive a novel training algorithm and corresponding network architecture based on the formulation. 
Experimental results are provided for both synthetic discrete data, and real discretized data from MNIST handwritten digits.", "keywords": "GAN;wasserstein distance;discrete probability distribution", "primary_area": "", "supplementary_material": "", "author": "Rizal Fathony;Naveen Goela", "authorids": "rfatho2@uic.edu;naveen.goela@technicolor.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfathony2018discrete,\ntitle={Discrete Wasserstein Generative Adversarial Networks ({DWGAN})},\nauthor={Rizal Fathony, Naveen Goela},\nyear={2018},\nurl={https://openreview.net/forum?id=H1AdTAxC-},\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Bkv76ilDz", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11134671787683031395&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Learning One-hidden-layer Neural Networks with Landscape Design", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/119", "id": "BkwHObbRZ", "author_site": "Rong Ge, Jason Lee, Tengyu Ma", "tldr": "The paper analyzes the optimization landscape of one-hidden-layer neural nets and designs a new objective that provably has no spurious local minimum. ", "abstract": "We consider the problem of learning a one-hidden-layer neural network: we assume the input x is from Gaussian distribution and the label $y = a \\sigma(Bx) + \\xi$, where a is a nonnegative vector and $B$ is a full-rank weight matrix, and $\\xi$ is a noise vector. We first give an analytic formula for the population risk of the standard squared loss and demonstrate that it implicitly attempts to decompose a sequence of low-rank tensors simultaneously. \n\t\nInspired by the formula, we design a non-convex objective function $G$ whose landscape is guaranteed to have the following properties:\t\n\n1. All local minima of $G$ are also global minima.\n2. All global minima of $G$ correspond to the ground truth parameters.\n3. The value and gradient of $G$ can be estimated using samples.\n\t\nWith these properties, stochastic gradient descent on $G$ provably converges to the global minimum and learn the ground-truth parameters. We also prove finite sample complexity results and validate the results by simulations. ", "keywords": "theory;non-convex optimization;loss surface", "primary_area": "", "supplementary_material": "", "author": "Rong Ge;Jason D. Lee;Tengyu Ma", "authorids": "rongge@cs.duke.edu;jasondlee88@gmail.com;tengyuma@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nge2018learning,\ntitle={Learning One-hidden-layer Neural Networks with Landscape Design},\nauthor={Rong Ge and Jason D. 
Lee and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BkwHObbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 318, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14789158572261278986&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BkwHObbRZ", "pdf": "https://openreview.net/pdf?id=BkwHObbRZ", "email": ";;", "author_num": 3 }, { "title": "A Framework for the Quantitative Evaluation of Disentangled Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/55", "id": "By-7dz-AZ", "author_site": "Cian Eastwood, Chris Williams", "tldr": "", "abstract": "Recent AI research has emphasised the importance of learning disentangled representations of the explanatory factors behind data. Despite the growing interest in models which can learn such representations, visual inspection remains the standard evaluation metric. While various desiderata have been implied in recent definitions, it is currently unclear what exactly makes one disentangled representation better than another. In this work we propose a framework for the quantitative evaluation of disentangled representations when the ground-truth latent structure is available. Three criteria are explicitly defined and quantified to elucidate the quality of learnt representations and thus compare models on an equal basis. To illustrate the appropriateness of the framework, we employ it to compare quantitatively the representations learned by recent state-of-the-art models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cian Eastwood;Christopher K. I. Williams", "authorids": "s1668298@ed.ac.uk;ckiw@inf.ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\neastwood2018a,\ntitle={A framework for the quantitative evaluation of disentangled representations},\nauthor={Cian Eastwood and Christopher K. I. 
Williams},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=By-7dz-AZ},\n}", "github": "[![github](/images/github_icon.svg) cianeastwood/qedr](https://github.com/cianeastwood/qedr) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=By-7dz-AZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 537, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3224087322020629595&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=By-7dz-AZ", "pdf": "https://openreview.net/pdf?id=By-7dz-AZ", "email": ";", "author_num": 2 }, { "id": "By-IifZRW", "title": "Gaussian Process Neurons", "track": "main", "status": "Reject", "tldr": "We model the activation function of each neuron as a Gaussian Process and learn it alongside the weight with Variational Inference.", "abstract": "We propose a method to learn stochastic activation functions for use in probabilistic neural networks.\nFirst, we develop a framework to embed stochastic activation functions based on Gaussian processes in probabilistic neural networks.\nSecond, we analytically derive expressions for the propagation of means and covariances in such a network, thus allowing for an efficient implementation and training without the need for sampling.\nThird, we show how to apply variational Bayesian inference to regularize and efficiently train this model.\nThe resulting model can deal with uncertain inputs and implicitly provides an estimate of the confidence of its predictions.\nLike a conventional neural network it can scale to datasets of arbitrary size and be extended with convolutional and recurrent connections, if desired.", "keywords": "gaussian process neuron activation function stochastic transfer function learning variational bayes probabilistic", "primary_area": "", "supplementary_material": "", "author": "Sebastian Urban;Patrick van der Smagt", "authorids": "surban@tum.de;smagt@brml.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nurban2018gaussian,\ntitle={Gaussian Process Neurons},\nauthor={Sebastian Urban and Patrick van der Smagt},\nyear={2018},\nurl={https://openreview.net/forum?id=By-IifZRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=By-IifZRW", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;2", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6774929047989506357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "By03VlJGG", "title": "Embedding Multimodal Relational Data", "track": "main", "status": "Withdraw", "tldr": "Extending relational modeling to support multimodal data using neural encoders.", "abstract": "Representing entities and relations in an embedding space is a well-studied approach for machine learning on relational data. 
Existing approaches, however, primarily focus on simple link structure between a finite set of entities, ignoring the variety of data types that are often used in relational databases, such as text, images, and numerical values. In our approach, we propose a multimodal embedding using different neural encoders for this variety of data, and combine it with existing models to learn embeddings of the entities. We extend existing datasets to create two novel benchmarks, YAGO-10-plus and MovieLens-100k-plus, that contain additional relations such as textual descriptions and images of the original entities. We demonstrate that our model utilizes the additional information effectively to provide further gains in accuracy. Moreover, we test our learned multimodal embeddings by using them to predict missing multimodal attributes. ", "keywords": "multimodal;knowledge base;relational modeling;embedding;link prediction;neural network encoders", "primary_area": "", "supplementary_material": "", "author": "Pouya Pezeshkpour;Liyan Chen;Sameer Singh", "authorids": "pezeshkp@uci.edu;liyanc@uci.edu;sameer@uci.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=By03VlJGG", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4062645943839484794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "By0ANxbRW", "title": "DNN Model Compression Under Accuracy Constraints", "track": "main", "status": "Reject", "tldr": "Compressing trained DNN models by minimizing their complexity while constraining their loss.", "abstract": "The growing interest in implementing Deep Neural Networks (DNNs) on resource-bound hardware has motivated innovation in compression algorithms. Using these algorithms, DNN model sizes can be substantially reduced, with little to no accuracy degradation. This is achieved by either eliminating components from the model, or penalizing complexity during training. While both approaches demonstrate considerable compressions, the former often ignores the loss function during compression while the latter produces unpredictable compressions. In this paper, we propose a technique that directly minimizes both the model complexity and the changes in the loss function. In this technique, we formulate compression as a constrained optimization problem, and then present a solution for it. 
We will show that using this technique, we can achieve competitive results.", "keywords": "DNN Compression;Weigh-sharing;Model Compression", "primary_area": "", "supplementary_material": "", "author": "Soroosh Khoram;Jing Li", "authorids": "khoram@wisc.edu;jli@ece.wisc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkhoram2018dnn,\ntitle={{DNN} Model Compression Under Accuracy Constraints},\nauthor={Soroosh Khoram and Jing Li},\nyear={2018},\nurl={https://openreview.net/forum?id=By0ANxbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=By0ANxbRW", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;5;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-tT3asfPvVsJ:scholar.google.com/&scioq=DNN+Model+Compression+Under+Accuracy+Constraints&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "By3VrbbAb", "title": "Realtime query completion via deep language models", "track": "main", "status": "Reject", "tldr": "realtime search query completion using character-level LSTM language models", "abstract": "Search engine users nowadays heavily depend on query completion and correction to shape their queries. Typically, the completion is done by database lookup which does not understand the context and cannot generalize to prefixes not in the database. In the paper, we propose to use unsupervised deep language models to complete and correct the queries given an arbitrary prefix. We show how to address two main challenges that renders this method practical for large-scale deployment: 1) we propose a method for integrating error correction into the language model completion via a edit-distance potential and a variant of beam search that can exploit these potential functions; and 2) we show how to efficiently perform CPU-based computation to complete the queries, with error correction, in real time (generating top 10 completions within 16 ms). Experiments show that the method substantially increases hit rate over standard approaches, and is capable of handling tail queries.\n", "keywords": "query completion;realtime;error correction;recurrent network;beam search", "primary_area": "", "supplementary_material": "", "author": "Po-Wei Wang;J. Zico Kolter;Vijai Mohan;Inderjit S. Dhillon", "authorids": "poweiw@cs.cmu.edu;zkolter@cs.cmu.edu;vijaim@amazon.com;isd@a9.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2018realtime,\ntitle={Realtime query completion via deep language models},\nauthor={Po-Wei Wang and J. Zico Kolter and Vijai Mohan and Inderjit S. 
Dhillon},\nyear={2018},\nurl={https://openreview.net/forum?id=By3VrbbAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=By3VrbbAb", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14936931705428867103&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "By3v9k-RZ", "title": "LEARNING TO ORGANIZE KNOWLEDGE WITH N-GRAM MACHINES", "track": "main", "status": "Workshop", "tldr": "We propose a framework that learns to encode knowledge symbolically and generate programs to reason about the encoded knowledge.", "abstract": "Deep neural networks (DNNs) had great success on NLP tasks such as language modeling, machine translation and certain question answering (QA) tasks. However, the success is limited at more knowledge intensive tasks such as QA from a big corpus. Existing end-to-end deep QA models (Miller et al., 2016; Weston et al., 2014) need to read the entire text after observing the question, and therefore their complexity in responding a question is linear in the text size. This is prohibitive for practical tasks such as QA from Wikipedia, a novel, or the Web. We propose to solve this scalability issue by using symbolic meaning representations, which can be indexed and retrieved efficiently with complexity that is independent of the text size. More specifically, we use sequence-to-sequence models to encode knowledge symbolically and generate programs to answer questions from the encoded knowledge. We apply our approach, called the N-Gram Machine (NGM), to the bAbI tasks (Weston et al., 2015) and a special version of them (\u201clife-long bAbI\u201d) which has stories of up to 10 million sentences. Our experiments show that NGM can successfully solve both of these tasks accurately and efficiently. Unlike fully differentiable memory models, NGM\u2019s time complexity and answering quality are not affected by the story length. The whole system of NGM is trained end-to-end with REINFORCE (Williams, 1992). To avoid high variance in gradient estimation, which is typical in discrete latent variable models, we use beam search instead of sampling. To tackle the exponentially large search space, we use a stabilized auto-encoding objective and a structure tweak procedure to iteratively reduce and refine the search space.\n", "keywords": "neuro-symbolic reasoning;information extraction;learn to search", "primary_area": "", "supplementary_material": "", "author": "Fan Yang;Jiazhong Nie;William W. Cohen;Ni Lao", "authorids": "fanyang1@cs.cmu.edu;niejiazhong@google.com;wcohen@cs.cmu.edu;nlao@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyang2018learning,\ntitle={{LEARNING} {TO} {ORGANIZE} {KNOWLEDGE} {WITH} N-{GRAM} {MACHINES}},\nauthor={Fan Yang and Jiazhong Nie and William W. 
Cohen and Ni Lao},\nyear={2018},\nurl={https://openreview.net/forum?id=By3v9k-RZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=By3v9k-RZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Deep Learning for Physical Processes: Incorporating Prior Scientific Knowledge", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/40", "id": "By4HsfWAZ", "author_site": "Emmanuel d Bezenac, Arthur Pajot, patrick Gallinari", "tldr": "", "abstract": "We consider the use of Deep Learning methods for modeling complex phenomena like those occurring in natural physical processes. With the large amount of data gathered on these phenomena the data intensive paradigm could begin to challenge more traditional approaches elaborated over the years in fields like maths or physics. However, despite considerable successes in a variety of application domains, the machine learning field is not yet ready to handle the level of complexity required by such problems. Using an example application, namely Sea Surface Temperature Prediction, we show how general background knowledge gained from the physics could be used as a guideline for designing efficient Deep Learning models. In order to motivate the approach and to assess its generality we demonstrate a formal link between the solution of a class of differential equations underlying a large family of physical phenomena and the proposed model. Experiments and comparison with series of baselines including a state of the art numerical approach is then provided.", "keywords": "deep learning;physical processes;forecasting;spatio-temporal", "primary_area": "", "supplementary_material": "", "author": "Emmanuel de Bezenac;Arthur Pajot;Patrick Gallinari", "authorids": "emmanuel.de_bezenac@lip6.fr;arthur.pajot@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nde2018deep,\ntitle={Deep Learning for Physical Processes: Incorporating Prior Scientific Knowledge},\nauthor={Emmanuel de Bezenac and Arthur Pajot and Patrick Gallinari},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=By4HsfWAZ},\n}", "github": "[![github](/images/github_icon.svg) emited/flow](https://github.com/emited/flow) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=By4HsfWAZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 412, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=339008717685681020&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=By4HsfWAZ", "pdf": "https://openreview.net/pdf?id=By4HsfWAZ", "email": ";;", "author_num": 3 }, { "id": "By5SY2gA-", "title": "Towards Building Affect sensitive Word Distributions", 
"track": "main", "status": "Reject", "tldr": "Enriching word embeddings with affect information improves their performance on sentiment prediction tasks.", "abstract": "Learning word representations from large available corpora relies on the distributional hypothesis that words present in similar contexts tend to have similar meanings. Recent work has shown that word representations learnt in this manner lack sentiment information which, fortunately, can be leveraged using external knowledge. Our work addresses the question: can affect lexica improve the word representations learnt from a corpus? In this work, we propose techniques to incorporate affect lexica, which capture fine-grained information about a word's psycholinguistic and emotional orientation, into the training process of Word2Vec SkipGram, Word2Vec CBOW and GloVe methods using a joint learning approach. We use affect scores from Warriner's affect lexicon to regularize the vector representations learnt from an unlabelled corpus. Our proposed method outperforms previously proposed methods on standard tasks for word similarity detection, outlier detection and sentiment detection. We also demonstrate the usefulness of our approach for a new task related to the prediction of formality, frustration and politeness in corporate communication.", "keywords": "Affect lexicon;word embeddings;Word2Vec;GloVe;WordNet;joint learning;sentiment analysis;word similarity;outlier detection;affect prediction", "primary_area": "", "supplementary_material": "", "author": "Kushal Chawla;Sopan Khosla;Niyati Chhaya;Kokil Jaidka", "authorids": "kchawla@adobe.com;skhosla@adobe.com;nchhaya@adobe.com;jaidka@sas.upenn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchawla2018towards,\ntitle={Towards Building Affect sensitive Word Distributions},\nauthor={Kushal Chawla and Sopan Khosla and Niyati Chhaya and Kokil Jaidka},\nyear={2018},\nurl={https://openreview.net/forum?id=By5SY2gA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=By5SY2gA-", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13071511067548387173&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "By5ugjyCb", "title": "PACT: Parameterized Clipping Activation for Quantized Neural Networks", "track": "main", "status": "Reject", "tldr": "A new way of quantizing activation of Deep Neural Network via parameterized clipping which optimizes the quantization scale via stochastic gradient descent.", "abstract": "Deep learning algorithms achieve high classification accuracy at the expense of significant computation cost. To address this cost, a number of quantization schemeshave been proposed - but most of these techniques focused on quantizing weights, which are relatively smaller in size compared to activations. This paper proposes a novel quantization scheme for activations during training - that enables neural networks to work well with ultra low precision weights and activations without any significant accuracy degradation. 
This technique, PArameterized Clipping acTi-vation (PACT), uses an activation clipping parameter \u03b1 that is optimized duringtraining to find the right quantization scale. PACT allows quantizing activations toarbitrary bit precisions, while achieving much better accuracy relative to publishedstate-of-the-art quantization schemes. We show, for the first time, that both weights and activations can be quantized to 4-bits of precision while still achieving accuracy comparable to full precision networks across a range of popular models and datasets. We also show that exploiting these reduced-precision computational units in hardware can enable a super-linear improvement in inferencing performance dueto a significant reduction in the area of accelerator compute engines coupled with the ability to retain the quantized model and activation data in on-chip memories.", "keywords": "deep learning;quantized deep neural network;activation quantization", "primary_area": "", "supplementary_material": "", "author": "Jungwook Choi;Zhuo Wang;Swagath Venkataramani;Pierce I-Jen Chuang;Vijayalakshmi Srinivasan;Kailash Gopalakrishnan", "authorids": "choij@us.ibm.com;choij@us.ibm.com;choij@us.ibm.com;choij@us.ibm.com;choij@us.ibm.com;choij@us.ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nchoi2018pact,\ntitle={{PACT}: Parameterized Clipping Activation for Quantized Neural Networks},\nauthor={Jungwook Choi and Zhuo Wang and Swagath Venkataramani and Pierce I-Jen Chuang and Vijayalakshmi Srinivasan and Kailash Gopalakrishnan},\nyear={2018},\nurl={https://openreview.net/forum?id=By5ugjyCb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=By5ugjyCb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=By5ugjyCb", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 1194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17925554201740498286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "By9iRkWA-", "title": "Phase Conductor on Multi-layered Attentions for Machine Comprehension", "track": "main", "status": "Reject", "tldr": "", "abstract": "Attention models have been intensively studied to improve NLP tasks such as machine comprehension via both question-aware passage attention model and self-matching attention model. Our research proposes phase conductor (PhaseCond) for attention models in two meaningful ways. First, PhaseCond, an architecture of multi-layered attention models, consists of multiple phases each implementing a stack of attention layers producing passage representations and a stack of inner or outer fusion layers regulating the information flow. Second, we extend and improve the dot-product attention function for PhaseCond by simultaneously encoding multiple question and passage embedding layers from different perspectives. We demonstrate the effectiveness of our proposed model PhaseCond on the SQuAD dataset, showing that our model significantly outperforms both state-of-the-art single-layered and multiple-layered attention models. 
We deepen our results with new findings via both detailed qualitative analysis and visualized examples showing the dynamic changes through multi-layered attention models.", "keywords": "Attention Model;Machine Comprehension;Question Answering", "primary_area": "", "supplementary_material": "", "author": "Rui Liu;Wei Wei;Weiguang Mao;Maria Chikina", "authorids": "ult.rui.liu@gmail.com;weiwei@cs.cmu.edu;mwg10.thu@gmail.com;mchikina@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliu2018phase,\ntitle={Phase Conductor on Multi-layered Attentions for Machine Comprehension},\nauthor={Rui Liu and Wei Wei and Weiguang Mao and Maria Chikina},\nyear={2018},\nurl={https://openreview.net/forum?id=By9iRkWA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=By9iRkWA-", "pdf_size": 0, "rating": "5;5;8", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12131973995838314634&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Parameter Space Noise for Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/228", "id": "ByBAl2eAZ", "author_site": "Matthias Plappert, Rein Houthooft, Prafulla Dhariwal, Szymon Sidor, Richard Chen, Xi Chen, Tamim Asfour, Pieter Abbeel, Marcin Andrychowicz", "tldr": "Parameter space noise allows reinforcement learning algorithms to explore by perturbing parameters instead of actions, often leading to significantly improved exploration performance.", "abstract": "Deep reinforcement learning (RL) methods generally engage in exploratory behavior through noise injection in the action space. An alternative is to add noise directly to the agent's parameters, which can lead to more consistent exploration and a richer set of behaviors. Methods such as evolutionary strategies use parameter perturbations, but discard all temporal structure in the process and require significantly more samples. Combining parameter noise with traditional RL methods allows to combine the best of both worlds. We demonstrate that both off- and on-policy methods benefit from this approach through experimental comparison of DQN, DDPG, and TRPO on high-dimensional discrete action environments as well as continuous control tasks.", "keywords": "reinforcement learning;exploration;parameter noise", "primary_area": "", "supplementary_material": "", "author": "Matthias Plappert;Rein Houthooft;Prafulla Dhariwal;Szymon Sidor;Richard Y. Chen;Xi Chen;Tamim Asfour;Pieter Abbeel;Marcin Andrychowicz", "authorids": "matthiasplappert@me.com;rein.houthooft@openai.com;prafulla@openai.com;szymon@openai.com;richardchen@openai.com;peter@openai.com;asfour@kit.edu;pabbeel@cs.berkeley.edu;marcin@openai.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nplappert2018parameter,\ntitle={Parameter Space Noise for Exploration},\nauthor={Matthias Plappert and Rein Houthooft and Prafulla Dhariwal and Szymon Sidor and Richard Y. 
Chen and Xi Chen and Tamim Asfour and Pieter Abbeel and Marcin Andrychowicz},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByBAl2eAZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 10 community implementations](https://paperswithcode.com/paper/?openreview=ByBAl2eAZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 9, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 811, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5517640716015156114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "openreview": "https://openreview.net/forum?id=ByBAl2eAZ", "pdf": "https://openreview.net/pdf?id=ByBAl2eAZ", "email": ";;;;;;;;", "author_num": 9 }, { "id": "ByCPHrgCW", "title": "Deep Learning Inferences with Hybrid Homomorphic Encryption", "track": "main", "status": "Reject", "tldr": "We made a feature-rich system for deep learning with encrypted inputs, producing encrypted outputs, preserving privacy.", "abstract": "When deep learning is applied to sensitive data sets, many privacy-related implementation issues arise. These issues are especially evident in the healthcare, finance, law and government industries. Homomorphic encryption could allow a server to make inferences on inputs encrypted by a client, but to our best knowledge, there has been no complete implementation of common deep learning operations, for arbitrary model depths, using homomorphic encryption. This paper demonstrates a novel approach, efficiently implementing many deep learning functions with bootstrapped homomorphic encryption. As part of our implementation, we demonstrate Single and Multi-Layer Neural Networks, for the Wisconsin Breast Cancer dataset, as well as a Convolutional Neural Network for MNIST. 
Our results give promising directions for privacy-preserving representation learning, and the return of data control to users.\n\n", "keywords": "deep learning;homomorphic encryption;hybrid homomorphic encryption;privacy preserving;representation learning;neural networks", "primary_area": "", "supplementary_material": "", "author": "Anthony Meehan;Ryan K L Ko;Geoff Holmes", "authorids": "anthonymeehan@anthonymeehan.com;ryan.ko@waikato.ac.nz;geoff@waikato.ac.nz", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmeehan2018deep,\ntitle={Deep Learning Inferences with Hybrid Homomorphic Encryption},\nauthor={Anthony Meehan and Ryan K L Ko and Geoff Holmes},\nyear={2018},\nurl={https://openreview.net/forum?id=ByCPHrgCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByCPHrgCW", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1535280204671071617&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ByED-X-0W", "title": "Parametric Information Bottleneck to Optimize Stochastic Neural Networks", "track": "main", "status": "Reject", "tldr": "Learning a better neural networks' representation with Information Bottleneck principle", "abstract": "In this paper, we present a layer-wise learning of stochastic neural networks (SNNs) in an information-theoretic perspective. In each layer of an SNN, the compression and the relevance are defined to quantify the amount of information that the layer contains about the input space and the target space, respectively. We jointly optimize the compression and the relevance of all parameters in an SNN to better exploit the neural network's representation. Previously, the Information Bottleneck (IB) framework (\\cite{Tishby99}) extracts relevant information for a target variable. Here, we propose Parametric Information Bottleneck (PIB) for a neural network by utilizing (only) its model parameters explicitly to approximate the compression and the relevance. We show that, as compared to the maximum likelihood estimate (MLE) principle, PIBs : (i) improve the generalization of neural networks in classification tasks, (ii) push the representation of neural networks closer to the optimal information-theoretical representation in a faster manner. ", "keywords": "Information Bottleneck;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Thanh T. Nguyen;Jaesik Choi", "authorids": "thanhnguyen2792@gmail.com;jaesik@unist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nt.2018parametric,\ntitle={Parametric Information Bottleneck to \\\\Optimize Stochastic Neural Networks},\nauthor={Thanh T. 
Nguyen and Jaesik Choi},\nyear={2018},\nurl={https://openreview.net/forum?id=ByED-X-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByED-X-0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9690439773821374416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByJ7obb0b", "title": "Understanding and Exploiting the Low-Rank Structure of Deep Networks", "track": "main", "status": "Reject", "tldr": "We show that deep learning network derivatives have a low-rank structure, and this structure allows us to use second-order derivative information to calculate learning rates adaptively and in a computationally feasible manner.", "abstract": "Training methods for deep networks are primarily variants on stochastic gradient descent. Techniques that use (approximate) second-order information are rarely used because of the computational cost and noise associated with those approaches in deep learning contexts. However, in this paper, we show how feedforward deep networks exhibit a low-rank derivative structure. This low-rank structure makes it possible to use second-order information without needing approximations and without incurring a significantly greater computational cost than gradient descent. To demonstrate this capability, we implement Cubic Regularization (CR) on a feedforward deep network with stochastic gradient descent and two of its variants. There, we use CR to calculate learning rates on a per-iteration basis while training on the MNIST and CIFAR-10 datasets. CR proved particularly successful in escaping plateau regions of the objective function. We also found that this approach requires less problem-specific information (e.g. an optimal initial learning rate) than other first-order methods in order to perform well.", "keywords": "Deep Learning;Derivative Calculations;Optimization Algorithms", "primary_area": "", "supplementary_material": "", "author": "Craig Bakker;Michael J. Henry;Nathan O. Hodas", "authorids": "craig.bakker@pnnl.gov;michael.j.henry@pnnl.gov;nathan.hodas@pnnl.gov", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbakker2018understanding,\ntitle={Understanding and Exploiting the Low-Rank Structure of Deep Networks},\nauthor={Craig Bakker and Michael J. Henry and Nathan O. 
Hodas},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJ7obb0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByJ7obb0b", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3268712259400074631&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ByJDAIe0b", "title": "Integrating Episodic Memory into a Reinforcement Learning Agent Using Reservoir Sampling", "track": "main", "status": "Reject", "tldr": "External memory for online reinforcement learning based on estimating gradients over a novel reservoir sampling technique.", "abstract": "Episodic memory is a psychology term which refers to the ability to recall specific events from the past. We suggest one advantage of this particular type of memory is the ability to easily assign credit to a specific state when remembered information is found to be useful. Inspired by this idea, and the increasing popularity of external memory mechanisms to handle long-term dependencies in deep learning systems, we propose a novel algorithm which uses a reservoir sampling procedure to maintain an external memory consisting of a fixed number of past states. The algorithm allows a deep reinforcement learning agent to learn online to preferentially remember those states which are found to be useful to recall later on. Critically this method allows for efficient online computation of gradient estimates with respect to the write process of the external memory. Thus unlike most prior mechanisms for external memory it is feasible to use in an online reinforcement learning setting.\n", "keywords": "reinforcement learning;external memory;deep learning;policy gradient;online learning", "primary_area": "", "supplementary_material": "", "author": "Kenny J. Young;Shuo Yang;Richard S. Sutton", "authorids": "kjyoung@ualberta.ca;rsutton@ualberta.ca;s-yan14@mails.tsinghua.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nj.2018integrating,\ntitle={Integrating Episodic Memory into a Reinforcement Learning Agent Using Reservoir Sampling},\nauthor={Kenny J. Young and Shuo Yang and Richard S. 
Sutton},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJDAIe0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByJDAIe0b", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;3", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6598828558122767538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "On the State of the Art of Evaluation in Neural Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/214", "id": "ByJHuTgA-", "author_site": "G\u00e1bor Melis, Chris Dyer, Phil Blunsom", "tldr": "Show that LSTMs are as good or better than recent innovations for LM and that model evaluation is often unreliable.", "abstract": "Ongoing innovations in recurrent neural network architectures have provided a steady influx of apparently state-of-the-art results on language modelling benchmarks. However, these have been evaluated using differing codebases and limited computational resources, which represent uncontrolled sources of experimental variation. We reevaluate several popular architectures and regularisation methods with large-scale automatic black-box hyperparameter tuning and arrive at the somewhat surprising conclusion that standard LSTM architectures, when properly regularised, outperform more recent models. We establish a new state of the art on the Penn Treebank and Wikitext-2 corpora, as well as strong baselines on the Hutter Prize dataset.\n", "keywords": "rnn;language modelling", "primary_area": "", "supplementary_material": "", "author": "G\u00e1bor Melis;Chris Dyer;Phil Blunsom", "authorids": "melisgl@google.com;cdyer@cs.cmu.edu;phil.blunsom@cs.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmelis2018on,\ntitle={On the State of the Art of Evaluation in Neural Language Models},\nauthor={G\u00e1bor Melis and Chris Dyer and Phil Blunsom},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJHuTgA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;2;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": -0.7857142857142858, "gs_citation": 678, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10520579957359692654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByJHuTgA-", "pdf": "https://openreview.net/pdf?id=ByJHuTgA-", "email": ";;", "author_num": 3 }, { "title": "Automatically Inferring Data Quality for Spatiotemporal Forecasting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/327", "id": "ByJIWUnpW", "author_site": "Sungyong Seo, Arash Mohegh, George Ban-Weiss, Yan Liu", "tldr": "We propose a method that infers the time-varying data quality level for spatiotemporal forecasting without explicitly assigned labels.", "abstract": "Spatiotemporal forecasting has become an increasingly important prediction task in machine learning and statistics due to its vast applications, 
such as climate modeling, traffic prediction, video caching predictions, and so on. While numerous studies have been conducted, most existing works assume that the data from different sources or across different locations are equally reliable. Due to cost, accessibility, or other factors, it is inevitable that the data quality could vary, which introduces significant biases into the model and leads to unreliable prediction results. The problem could be exacerbated in black-box prediction models, such as deep neural networks. In this paper, we propose a novel solution that can automatically infer data quality levels of different sources through local variations of spatiotemporal signals without explicit labels. Furthermore, we integrate the estimate of data quality level with graph convolutional networks to exploit their efficient structures. We evaluate our proposed method on forecasting temperatures in Los Angeles.", "keywords": "spatiotemporal data;graph convolutional network;data quality", "primary_area": "", "supplementary_material": "", "author": "Sungyong Seo;Arash Mohegh;George Ban-Weiss;Yan Liu", "authorids": "sungyons@usc.edu;mohegh@usc.edu;banweiss@usc.edu;yanliu.cs@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nseo2018automatically,\ntitle={Automatically Inferring Data Quality for Spatiotemporal Forecasting},\nauthor={Sungyong Seo and Arash Mohegh and George Ban-Weiss and Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJIWUnpW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12024707960689250283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ByJIWUnpW", "pdf": "https://openreview.net/pdf?id=ByJIWUnpW", "email": ";;;", "author_num": 4 }, { "id": "ByJWeR1AW", "title": "Data augmentation instead of explicit regularization", "track": "main", "status": "Reject", "tldr": "In a deep convolutional neural network trained with sufficient level of data augmentation, optimized by SGD, explicit regularizers (weight decay and dropout) might not provide any additional generalization improvement.", "abstract": "Modern deep artificial neural networks have achieved impressive results through models with very large capacity---compared to the number of training examples---that control overfitting with the help of different forms of regularization. Regularization can be implicit, as is the case of stochastic gradient descent or parameter sharing in convolutional layers, or explicit. Most common explicit regularization techniques, such as dropout and weight decay, reduce the effective capacity of the model and typically require the use of deeper and wider architectures to compensate for the reduced capacity. Although these techniques have been proven successful in terms of results, they seem to waste capacity. 
In contrast, data augmentation techniques reduce the generalization error by increasing the number of training examples and without reducing the effective capacity. In this paper we systematically analyze the effect of data augmentation on some popular architectures and conclude that data augmentation alone---without any other explicit regularization techniques---can achieve the same performance or higher as regularized models, especially when training with fewer examples.", "keywords": "deep learning;data augmentation;regularization", "primary_area": "", "supplementary_material": "", "author": "Alex Hern\u00e1ndez-Garc\u00eda;Peter K\u00f6nig", "authorids": "alexhg15@gmail.com;pkoenig@uos.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhern\u00e1ndez-garc\u00eda2018data,\ntitle={Data augmentation instead of explicit regularization},\nauthor={Alex Hern\u00e1ndez-Garc\u00eda and Peter K\u00f6nig},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJWeR1AW},\n}", "github": "[![github](/images/github_icon.svg) alexhernandezgarcia/data-aug-invariance](https://github.com/alexhernandezgarcia/data-aug-invariance) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ByJWeR1AW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByJWeR1AW", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18176253708979082271&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ByJbJwxCW", "title": "Relational Multi-Instance Learning for Concept Annotation from Medical Time Series", "track": "main", "status": "Reject", "tldr": "We propose a deep Multi Instance Learning framework based on recurrent neural networks which uses pooling functions and attention mechanisms for the concept annotation tasks.", "abstract": "Recent advances in computing technology and sensor design have made it easier to collect longitudinal or time series data from patients, resulting in a gigantic amount of available medical data. Most of the medical time series lack annotations or even when the annotations are available they could be subjective and prone to human errors. Earlier works have developed natural language processing techniques to extract concept annotations and/or clinical narratives from doctor notes. However, these approaches are slow and do not use the accompanying medical time series data. To address this issue, we introduce the problem of concept annotation for the medical time series data, i.e., the task of predicting and localizing medical concepts by using the time series data as input. We propose Relational Multi-Instance Learning (RMIL) - a deep Multi Instance Learning framework based on recurrent neural networks, which uses pooling functions and attention mechanisms for the concept annotation tasks. 
Empirical results on medical datasets show that our proposed models outperform various multi-instance learning models.", "keywords": "Multi-instance learning;Medical Time Series;Concept Annotation", "primary_area": "", "supplementary_material": "", "author": "Sanjay Purushotham;Zhengping Che;Bo Jiang;Tanachat Nilanon;Yan Liu", "authorids": "spurusho@usc.edu;zche@usc.edu;boj@usc.edu;nilanon@usc.edu;yanliu.cs@usc.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npurushotham2018relational,\ntitle={Relational Multi-Instance Learning for Concept Annotation from Medical Time Series},\nauthor={Sanjay Purushotham and Zhengping Che and Bo Jiang and Tanachat Nilanon and Yan Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=ByJbJwxCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByJbJwxCW", "pdf_size": 0, "rating": "3;3;6", "confidence": "3;5;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aStocXexJBIJ:scholar.google.com/&scioq=Relational+Multi-Instance+Learning+for+Concept+Annotation+from+Medical+Time+Series&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "GANITE: Estimation of Individualized Treatment Effects using Generative Adversarial Nets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/153", "id": "ByKWUeWA-", "author_site": "Jinsung Yoon, James Jordan, Mihaela v Schaar", "tldr": "", "abstract": "Estimating individualized treatment effects (ITE) is a challenging task due to the need for an individual's potential outcomes to be learned from biased data and without having access to the counterfactuals. We propose a novel method for inferring ITE based on the Generative Adversarial Nets (GANs) framework. Our method, termed Generative Adversarial Nets for inference of Individualized Treatment Effects (GANITE), is motivated by the possibility that we can capture the uncertainty in the counterfactual distributions by attempting to learn them using a GAN. We generate proxies of the counterfactual outcomes using a counterfactual generator, G, and then pass these proxies to an ITE generator, I, in order to train it. By modeling both of these using the GAN framework, we are able to infer based on the factual data, while still accounting for the unseen counterfactuals. 
We test our method on three real-world datasets (with both binary and multiple treatments) and show that GANITE outperforms state-of-the-art methods.", "keywords": "Individualized Treatment Effects;Counterfactual Estimation;Generative Adversarial Nets", "primary_area": "", "supplementary_material": "", "author": "Jinsung Yoon;James Jordon;Mihaela van der Schaar", "authorids": "jsyoon0823@gmail.com;james.jordon@hertford.ox.ac.uk;mihaela.vanderschaar@oxford-man.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyoon2018ganite,\ntitle={{GANITE}: Estimation of Individualized Treatment Effects using Generative Adversarial Nets},\nauthor={Jinsung Yoon and James Jordon and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByKWUeWA-},\n}", "github": "[![github](/images/github_icon.svg) vanderschaarlab/mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/ganite) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ByKWUeWA-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 536, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=880825106986572029&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByKWUeWA-", "pdf": "https://openreview.net/pdf?id=ByKWUeWA-", "email": ";;", "author_num": 3 }, { "id": "ByL48G-AW", "title": "Simple Nearest Neighbor Policy Method for Continuous Control Tasks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We design a new policy, called a nearest neighbor policy, that does not require any optimization for simple, low-dimensional continuous control tasks. As this policy does not require any optimization, it allows us to investigate the underlying difficulty of a task without being distracted by optimization difficulty of a learning algorithm. We propose two variants, one that retrieves an entire trajectory based on a pair of initial and goal states, and the other retrieving a partial trajectory based on a pair of current and goal states. We test the proposed policies on five widely-used benchmark continuous control tasks with a sparse reward: Reacher, Half Cheetah, Double Pendulum, Cart Pole and Mountain Car. We observe that the majority (the first four) of these tasks, which have been considered difficult, are easily solved by the proposed policies with high success rates, indicating that reported difficulties of them may have likely been due to the optimization difficulty. 
Our work suggests that it is necessary to evaluate any sophisticated policy learning algorithm on more challenging problems in order to truly assess the advances from them.", "keywords": "nearest neighbor;reinforcement learning;policy;continuous control", "primary_area": "", "supplementary_material": "", "author": "Elman Mansimov;Kyunghyun Cho", "authorids": "mansimov@cs.nyu.edu;kyunghyun.cho@nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmansimov2018simple,\ntitle={Simple Nearest Neighbor Policy Method for Continuous Control Tasks},\nauthor={Elman Mansimov and Kyunghyun Cho},\nyear={2018},\nurl={https://openreview.net/forum?id=ByL48G-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByL48G-AW", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11548226737838649779&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "MaskGAN: Better Text Generation via Filling in the _______", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/10", "id": "ByOExmWAb", "author_site": "William Fedus, Ian Goodfellow, Andrew Dai", "tldr": "Natural language GAN for filling in the blank", "abstract": "Neural text generation models are often autoregressive language models or seq2seq models. Neural autoregressive and seq2seq models that generate text by sampling words sequentially, with each word conditioned on the previous model, are state-of-the-art for several machine translation and summarization benchmarks. These benchmarks are often defined by validation perplexity even though this is not a direct measure of sample quality. Language models are typically trained via maximum likelihood and most often with teacher forcing. Teacher forcing is well-suited to optimizing perplexity but can result in poor sample quality because generating text requires conditioning on sequences of words that were never observed at training time. We propose to improve sample quality using Generative Adversarial Networks (GANs), which explicitly train the generator to produce high quality samples and have shown a lot of success in image generation. GANs were originally designed to output differentiable values, so discrete language generation is challenging for them. We introduce an actor-critic conditional GAN that fills in missing text conditioned on the surrounding context. We show, qualitatively and quantitatively, evidence that this produces more realistic text samples compared to a maximum likelihood trained model.", "keywords": "Deep learning;GAN", "primary_area": "", "supplementary_material": "", "author": "William Fedus;Ian Goodfellow;Andrew M. Dai", "authorids": "liam.fedus@gmail.com;goodfellow@google.com;adai@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfedus2018maskgan,\ntitle={Mask{GAN}: Better Text Generation via Filling in the _______},\nauthor={William Fedus and Ian Goodfellow and Andrew M. 
Dai},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByOExmWAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;5;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 661, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8054442901795858629&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ByOExmWAb", "pdf": "https://openreview.net/pdf?id=ByOExmWAb", "email": ";;", "author_num": 3 }, { "title": "Detecting Statistical Interactions from Neural Network Weights", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/285", "id": "ByOfBggRZ", "author_site": "Michael Tsang, Dehua Cheng, Yan Liu", "tldr": "We detect statistical interactions captured by a feedforward multilayer neural network by directly interpreting its learned weights.", "abstract": "Interpreting neural networks is a crucial and challenging task in machine learning. In this paper, we develop a novel framework for detecting statistical interactions captured by a feedforward multilayer neural network by directly interpreting its learned weights. Depending on the desired interactions, our method can achieve significantly better or similar interaction detection performance compared to the state-of-the-art without searching an exponential solution space of possible interactions. We obtain this accuracy and efficiency by observing that interactions between input features are created by the non-additive effect of nonlinear activation functions, and that interacting paths are encoded in weight matrices. We demonstrate the performance of our method and the importance of discovered interactions via experimental results on both synthetic datasets and real-world application datasets. 
", "keywords": "statistical interaction detection;multilayer perceptron;generalized additive model", "primary_area": "", "supplementary_material": "", "author": "Michael Tsang;Dehua Cheng;Yan Liu", "authorids": "tsangm@usc.edu;dehuache@usc.edu;yanliu.cs@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntsang2018detecting,\ntitle={Detecting Statistical Interactions from Neural Network Weights},\nauthor={Michael Tsang and Dehua Cheng and Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByOfBggRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 243, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2052128635876795940&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByOfBggRZ", "pdf": "https://openreview.net/pdf?id=ByOfBggRZ", "email": ";;", "author_num": 3 }, { "title": "Policy Optimization by Genetic Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/160", "id": "ByOnmlWC-", "author_site": "Tanmay Gangwani, Jian Peng", "tldr": "Genetic algorithms based approach for optimizing deep neural network policies", "abstract": "Genetic algorithms have been widely used in many practical optimization problems.\nInspired by natural selection, operators, including mutation, crossover\nand selection, provide effective heuristics for search and black-box optimization.\nHowever, they have not been shown useful for deep reinforcement learning, possibly\ndue to the catastrophic consequence of parameter crossovers of neural networks.\nHere, we present Genetic Policy Optimization (GPO), a new genetic algorithm\nfor sample-efficient deep policy optimization. 
GPO uses imitation learning\nfor policy crossover in the state space and applies policy gradient methods for mutation.\nOur experiments on MuJoCo tasks show that GPO as a genetic algorithm\nis able to provide superior performance over the state-of-the-art policy gradient\nmethods and achieves comparable or higher sample efficiency.", "keywords": "Genetic algorithms;deep reinforcement learning;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Tanmay Gangwani;Jian Peng", "authorids": "gangwan2@illinois.edu;jianpeng@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ngangwani2018genetic,\ntitle={Genetic Policy Optimization},\nauthor={Tanmay Gangwani and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByOnmlWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "3;6;8", "confidence": "4;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.8029550685469661, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15865429429504439749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ByOnmlWC-", "pdf": "https://openreview.net/pdf?id=ByOnmlWC-", "email": ";", "author_num": 2 }, { "id": "ByQZjx-0-", "title": "Faster Discovery of Neural Architectures by Searching for Paths in a Large Model", "track": "main", "status": "Workshop", "tldr": "An approach that speeds up neural architecture search by 10x, whilst using 100x less computing resource.", "abstract": "We propose Efficient Neural Architecture Search (ENAS), a faster and less expensive approach to automated model design than previous methods. In ENAS, a controller learns to discover neural network architectures by searching for an optimal path within a larger model. The controller is trained with policy gradient to select a path that maximizes the expected reward on the validation set. Meanwhile the model corresponding to the selected path is trained to minimize the cross entropy loss. On the Penn Treebank dataset, ENAS can discover a novel architecture that achieves a test perplexity of 57.8, which is state-of-the-art among automatic model design methods on Penn Treebank. On the CIFAR-10 dataset, ENAS can design novel architectures that achieve a test error of 2.89%, close to the 2.65% achieved by standard NAS (Zoph et al., 2017). Most importantly, our experiments show that ENAS is more than 10x faster and 100x less resource-demanding than NAS.", "keywords": "neural architecture search", "primary_area": "", "supplementary_material": "", "author": "Hieu Pham;Melody Y. Guan;Barret Zoph;Quoc V. Le;Jeff Dean", "authorids": "hyhieu@cmu.edu;mguan@stanford.edu;barretzoph@google.com;qvl@google.com;jeff@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npham2018faster,\ntitle={Faster Discovery of Neural Architectures by Searching for Paths in a Large Model},\nauthor={Hieu Pham and Melody Y. Guan and Barret Zoph and Quoc V. 
Le and Jeff Dean},\nyear={2018},\nurl={https://openreview.net/forum?id=ByQZjx-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByQZjx-0-", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;2;2", "rating_avg": 5.333333333333333, "confidence_avg": 2.3333333333333335, "replies_avg": 22, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=286449918794248844&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Many Paths to Equilibrium: GANs Do Not Need to Decrease a Divergence At Every Step", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/180", "id": "ByQpn1ZA-", "author_site": "William Fedus, Mihaela Rosca, Balaji Lakshminarayanan, Andrew Dai, Shakir Mohamed, Ian Goodfellow", "tldr": "We find evidence that divergence minimization may not be an accurate characterization of GAN training.", "abstract": "Generative adversarial networks (GANs) are a family of generative models that do not minimize a single training criterion. Unlike other generative models, the data distribution is learned via a game between a generator (the generative model) and a discriminator (a teacher providing training signal) that each minimize their own cost. GANs are designed to reach a Nash equilibrium at which each player cannot reduce their cost without changing the other players\u2019 parameters. One useful approach for the theory of GANs is to show that a divergence between the training distribution and the model distribution obtains its minimum value at equilibrium. Several recent research directions have been motivated by the idea that this divergence is the primary guide for the learning process and that every step of learning should decrease the divergence. We show that this view is overly restrictive. During GAN training, the discriminator provides learning signal in situations where the gradients of the divergences between distributions would not be useful. We provide empirical counterexamples to the view of GAN training as divergence minimization. Specifically, we demonstrate that GANs are able to learn distributions in situations where the divergence minimization point of view predicts they would fail. We also show that gradient penalties motivated from the divergence minimization perspective are equally helpful when applied in other contexts in which the divergence minimization perspective does not predict they would be helpful. This contributes to a growing body of evidence that GAN training may be more usefully viewed as approaching Nash equilibria via trajectories that do not necessarily minimize a specific divergence at each step.", "keywords": "Deep learning;GAN", "primary_area": "", "supplementary_material": "", "author": "William Fedus*;Mihaela Rosca*;Balaji Lakshminarayanan;Andrew M. 
Dai;Shakir Mohamed;Ian Goodfellow", "authorids": "liam.fedus@gmail.com;mihaelacr@google.com;balajiln@google.com;adai@google.com;shakir@google.com;goodfellow@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nfedus*2018many,\ntitle={Many Paths to Equilibrium: {GAN}s Do Not Need to Decrease a Divergence At Every Step},\nauthor={William Fedus* and Mihaela Rosca* and Balaji Lakshminarayanan and Andrew M. Dai and Shakir Mohamed and Ian Goodfellow},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByQpn1ZA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;7;8", "confidence": "4;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": 0.2401922307076307, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15921455512138469908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByQpn1ZA-", "pdf": "https://openreview.net/pdf?id=ByQpn1ZA-", "email": ";;;;;", "author_num": 6 }, { "title": "Learning to cluster in order to transfer across domains and tasks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/333", "id": "ByRWCqvT-", "author_site": "Yen-Chang Hsu, Zhaoyang Lv, Zsolt Kira", "tldr": "A learnable clustering objective to facilitate transfer learning across domains and tasks", "abstract": "This paper introduces a novel method to perform transfer learning across domains and tasks, formulating it as a problem of learning to cluster. The key insight is that, in addition to features, we can transfer similarity information and this is sufficient to learn a similarity function and clustering network to perform both domain adaptation and cross-task transfer learning. We begin by reducing categorical information to pairwise constraints, which only considers whether two instances belong to the same class or not (pairwise semantic similarity). This similarity is category-agnostic and can be learned from data in the source domain using a similarity network. We then present two novel approaches for performing transfer learning using this similarity function. First, for unsupervised domain adaptation, we design a new loss function to regularize classification with a constrained clustering loss, hence learning a clustering network with the transferred similarity metric generating the training inputs. Second, for cross-task learning (i.e., unsupervised clustering with unseen categories), we propose a framework to reconstruct and estimate the number of semantic clusters, again using the clustering network. Since the similarity network is noisy, the key is to use a robust clustering algorithm, and we show that our formulation is more robust than the alternative constrained and unconstrained clustering approaches. Using this method, we first show state of the art results for the challenging cross-task problem, applied on Omniglot and ImageNet. Our results show that we can reconstruct semantic clusters with high accuracy. We then evaluate the performance of cross-domain transfer using images from the Office-31 and SVHN-MNIST tasks and present top accuracy on both datasets. 
Our approach doesn't explicitly deal with domain discrepancy. If we combine with a domain adaptation loss, it shows further improvement.", "keywords": "transfer learning;similarity prediction;clustering;domain adaptation;unsupervised learning;computer vision;deep learning;constrained clustering", "primary_area": "", "supplementary_material": "", "author": "Yen-Chang Hsu;Zhaoyang Lv;Zsolt Kira", "authorids": "yenchang.hsu@gatech.edu;zhaoyang.lv@gatech.edu;zkira@gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhsu2018learning,\ntitle={Learning to cluster in order to transfer across domains and tasks},\nauthor={Yen-Chang Hsu and Zhaoyang Lv and Zsolt Kira},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByRWCqvT-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 278, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5063814464550490029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByRWCqvT-", "pdf": "https://openreview.net/pdf?id=ByRWCqvT-", "email": ";;", "author_num": 3 }, { "title": "cGANs with Projection Discriminator", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/217", "id": "ByS1VpgRZ", "author_site": "Takeru Miyato, Masanori Koyama", "tldr": "We propose a novel, projection based way to incorporate the conditional information into the discriminator of GANs that respects the role of the conditional information in the underlining probabilistic model.", "abstract": "We propose a novel, projection based way to incorporate the conditional information into the discriminator of GANs that respects the role of the conditional information in the underlining probabilistic model. \nThis approach is in contrast with most frameworks of conditional GANs used in application today, which use the conditional information by concatenating the (embedded) conditional vector to the feature vectors. \nWith this modification, we were able to significantly improve the quality of the class conditional image generation on ILSVRC2012 (ImageNet) dataset from the current state-of-the-art result, and we achieved this with a single pair of a discriminator and a generator. \nWe were also able to extend the application to super-resolution and succeeded in producing highly discriminative super-resolution images. 
\nThis new structure also enabled high quality category transformation based on parametric functional transformation of conditional batch normalization layers in the generator.", "keywords": "Generative Adversarial Networks;GANs;conditional GANs;Generative models;Projection", "primary_area": "", "supplementary_material": "", "author": "Takeru Miyato;Masanori Koyama", "authorids": "miyato@preferred.jp;koyama.masanori@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmiyato2018cgans,\ntitle={c{GAN}s with Projection Discriminator},\nauthor={Takeru Miyato and Masanori Koyama},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByS1VpgRZ},\n}", "github": "[![github](/images/github_icon.svg) pfnet-research/sngan_projection](https://github.com/pfnet-research/sngan_projection) + [![Papers with Code](/images/pwc_icon.svg) 11 community implementations](https://paperswithcode.com/paper/?openreview=ByS1VpgRZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 694, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2383594201116967790&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ByS1VpgRZ", "pdf": "https://openreview.net/pdf?id=ByS1VpgRZ", "email": ";", "author_num": 2 }, { "title": "Learning Discrete Weights Using the Local Reparameterization Trick", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/314", "id": "BySRH6CpW", "author_site": "Oran Shayer, Dan Levi, Ethan Fetaya", "tldr": "Training binary/ternary networks using local reparameterization with the CLT approximation", "abstract": "Recent breakthroughs in computer vision make use of large deep neural networks, utilizing the substantial speedup offered by GPUs. For applications running on limited hardware, however, high precision real-time processing can still be a challenge. One approach to solving this problem is training networks with binary or ternary weights, thus removing the need to calculate multiplications and significantly reducing memory size. In this work, we introduce LR-nets (Local reparameterization networks), a new method for training neural networks with discrete weights using stochastic parameters. We show how a simple modification to the local reparameterization trick, previously used to train Gaussian distributed weights, enables the training of discrete weights. 
Using the proposed training we test both binary and ternary models on MNIST, CIFAR-10 and ImageNet benchmarks and reach state-of-the-art results on most experiments.", "keywords": "deep learning;discrete weight network", "primary_area": "", "supplementary_material": "", "author": "Oran Shayer;Dan Levi;Ethan Fetaya", "authorids": "oran.sh@gmail.com;dan.levi@gm.com;ethanf@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nshayer2018learning,\ntitle={Learning Discrete Weights Using the Local Reparameterization Trick},\nauthor={Oran Shayer and Dan Levi and Ethan Fetaya},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BySRH6CpW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 164, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13612900958005340664&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BySRH6CpW", "pdf": "https://openreview.net/pdf?id=BySRH6CpW", "email": ";;", "author_num": 3 }, { "id": "ByUEelW0-", "title": "Modifying memories in a Recurrent Neural Network Unit", "track": "main", "status": "Reject", "tldr": "Adding a new set of weights to the LSTM that rotate the cell memory improves performance on some bAbI tasks.", "abstract": "Long Short-Term Memory (LSTM) units have the ability to memorise and use long-term dependencies between inputs to generate predictions on time series data. We introduce the concept of modifying the cell state (memory) of LSTMs using rotation matrices parametrised by a new set of trainable weights. 
This addition shows significant increases of performance on some of the tasks from the bAbI dataset.", "keywords": "LSTM;RNN;rotation matrix;long-term memory;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Vlad Velici;Adam Pr\u00fcgel-Bennett", "authorids": "vsv1g12@soton.ac.uk;apb@soton.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nvelici2018modifying,\ntitle={Modifying memories in a Recurrent Neural Network Unit},\nauthor={Vlad Velici and Adam Pr\u00fcgel-Bennett},\nyear={2018},\nurl={https://openreview.net/forum?id=ByUEelW0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByUEelW0-", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-VehzqKJDHQJ:scholar.google.com/&scioq=Modifying+memories+in+a+Recurrent+Neural+Network+Unit&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ByW5yxgA-", "title": "Multiscale Hidden Markov Models For Covariance Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper presents a novel variant of hierarchical hidden Markov models (HMMs), the multiscale hidden Markov model (MSHMM), and an associated spectral estimation and prediction scheme that is consistent, finds global optima, and is computationally efficient. Our MSHMM is a generative model of multiple HMMs evolving at different rates where the observation is a result of the additive emissions of the HMMs. While estimation is relatively straightforward, prediction for the MSHMM poses a unique challenge, which we address in this paper. 
Further, we show that spectral estimation of the MSHMM outperforms standard methods of predicting the asset covariance of stock prices, a widely addressed problem that is multiscale, non-stationary, and requires processing huge amounts of data.", "keywords": "multiscale models;hidden Markov model;covariance prediction", "primary_area": "", "supplementary_material": "", "author": "Jo\u00e3o Sedoc;Jordan Rodu;Dean Foster;Lyle Ungar", "authorids": "joao@cis.upenn.edu;jsr6q@virginia.edu;dean@foster.net;ungar@cis.upenn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsedoc2018multiscale,\ntitle={Multiscale Hidden Markov Models For Covariance Prediction},\nauthor={Jo\u00e3o Sedoc and Jordan Rodu and Dean Foster and Lyle Ungar},\nyear={2018},\nurl={https://openreview.net/forum?id=ByW5yxgA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByW5yxgA-", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12199778784117704016&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByYPLJA6W", "title": "Distribution Regression Network", "track": "main", "status": "Reject", "tldr": "A learning network which generalizes the MLP framework to perform distribution-to-distribution regression", "abstract": "We introduce our Distribution Regression Network (DRN) which performs regression from input probability distributions to output probability distributions. Compared to existing methods, DRN learns with fewer model parameters and easily extends to multiple input and multiple output distributions. On synthetic and real-world datasets, DRN performs similarly or better than the state-of-the-art. Furthermore, DRN generalizes the conventional multilayer perceptron (MLP). In the framework of MLP, each node encodes a real number, whereas in DRN, each node encodes a probability distribution. 
", "keywords": "distribution regression;supervised learning;regression analysis", "primary_area": "", "supplementary_material": "", "author": "Connie Kou;Hwee Kuan Lee;Teck Khim Ng", "authorids": "koukl@comp.nus.edu.sg;leehk@bii.a-star.edu.sg;ngtk@comp.nus.edu.sg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkou2018distribution,\ntitle={Distribution Regression Network},\nauthor={Connie Kou and Hwee Kuan Lee and Teck Khim Ng},\nyear={2018},\nurl={https://openreview.net/forum?id=ByYPLJA6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByYPLJA6W", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;2", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "ByZmGjkA-", "title": "Understanding Grounded Language Learning Agents", "track": "main", "status": "Reject", "tldr": "Analysing and understanding how neural network agents learn to understand simple grounded language", "abstract": "Neural network-based systems can now learn to locate the referents of words and phrases in images, answer questions about visual scenes, and even execute symbolic instructions as first-person actors in partially-observable worlds. To achieve this so-called grounded language learning, models must overcome certain well-studied learning challenges that are also fundamental to infants learning their first words. While it is notable that models with no meaningful prior knowledge overcome these learning obstacles, AI researchers and practitioners currently lack a clear understanding of exactly how they do so. Here we address this question as a way of achieving a clearer general understanding of grounded language learning, both to inform future research and to improve confidence in model predictions. For maximum control and generality, we focus on a simple neural network-based language learning agent trained via policy-gradient methods to interpret synthetic linguistic instructions in a simulated 3D world. We apply experimental paradigms from developmental psychology to this agent, exploring the conditions under which established human biases and learning effects emerge. 
We further propose a novel way to visualise and analyse semantic representation in grounded language learning agents that yields a plausible computational account of the observed effects.", "keywords": "Language AI Learning Reinforcement Deep", "primary_area": "", "supplementary_material": "", "author": "Felix Hill;Karl Moritz Hermann;Phil Blunsom;Stephen Clark", "authorids": "felixhill@google.com;kmh@google.com;pblunsom@google.com;clarkstephen@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhill2018understanding,\ntitle={Understanding Grounded Language Learning Agents},\nauthor={Felix Hill and Karl Moritz Hermann and Phil Blunsom and Stephen Clark},\nyear={2018},\nurl={https://openreview.net/forum?id=ByZmGjkA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByZmGjkA-", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.3273268353539886, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14632733652110827088&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Bya8fGWAZ", "title": "Value Propagation Networks", "track": "main", "status": "Workshop", "tldr": "We propose Value Propagation, a novel end-to-end planner which can learn to solve 2D navigation tasks via Reinforcement Learning, and that generalizes to larger and dynamic environments.", "abstract": "We present Value Propagation (VProp), a parameter-efficient differentiable planning module built on Value Iteration which can successfully be trained in a reinforcement learning fashion to solve unseen tasks, has the capability to generalize to larger map sizes, and can learn to navigate in dynamic environments. We evaluate on configurations of MazeBase grid-worlds, with randomly generated environments of several different sizes. 
Furthermore, we show that the module enables learning to plan when the environment also includes stochastic elements, providing a cost-efficient learning system to build low-level size-invariant planners for a variety of interactive navigation problems.", "keywords": "Learning to plan;Reinforcement Learning;Value Iteration;Navigation;Convnets", "primary_area": "", "supplementary_material": "", "author": "Nantas Nardelli;Gabriel Synnaeve;Zeming Lin;Pushmeet Kohli;Nicolas Usunier", "authorids": "nantas@robots.ox.ac.uk;gab@fb.com;zlin@fb.com;pushmeet@google.com;usunier@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnardelli2018value,\ntitle={Value Propagation Networks},\nauthor={Nantas Nardelli and Gabriel Synnaeve and Zeming Lin and Pushmeet Kohli and Nicolas Usunier},\nyear={2018},\nurl={https://openreview.net/forum?id=Bya8fGWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=Bya8fGWAZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;2;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9180230208406770561&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ByaQIGg0-", "title": "AUTOMATED DESIGN USING NEURAL NETWORKS AND GRADIENT DESCENT", "track": "main", "status": "Workshop", "tldr": "A method for performing automated design on real world objects such as heat sinks and wing airfoils that makes use of neural networks and gradient descent.", "abstract": "We propose a novel method that makes use of deep neural networks and gradient descent to perform automated design on complex real world engineering tasks. Our approach works by training a neural network to mimic the fitness function of a design optimization task and then, using the differentiable nature of the neural network, perform gradient descent to maximize the fitness. We demonstrate this method's effectiveness by designing an optimized heat sink and both 2D and 3D airfoils that maximize the lift drag ratio under steady state flow conditions. We highlight that our method has two distinct benefits over other automated design approaches. First, evaluating the neural network's prediction of fitness can be orders of magnitude faster than simulating the system of interest. Second, using gradient descent allows the design space to be searched much more efficiently than other gradient-free methods. 
These two strengths work together to overcome some of the current shortcomings of automated design.", "keywords": "Deep Learning;Automated Design;Gradient Descent", "primary_area": "", "supplementary_material": "", "author": "Oliver Hennigh", "authorids": "loliverhennigh101@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nhennigh2018automated,\ntitle={{AUTOMATED} {DESIGN} {USING} {NEURAL} {NETWORKS} {AND} {GRADIENT} {DESCENT}},\nauthor={Oliver Hennigh},\nyear={2018},\nurl={https://openreview.net/forum?id=ByaQIGg0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByaQIGg0-", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4265610575771216491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BybQ7zWCb", "title": "\u201cStyle\u201d Transfer for Musical Audio Using Multiple Time-Frequency Representations", "track": "main", "status": "Reject", "tldr": "We present a long time-scale musical audio style transfer algorithm which synthesizes audio in the time-domain, but uses Time-Frequency representations of audio.", "abstract": "Neural Style Transfer has become a popular technique for\ngenerating images of distinct artistic styles using convolutional neural networks. This\nrecent success in image style transfer has raised the question of\nwhether similar methods can be leveraged to alter the \u201cstyle\u201d of musical\naudio. In this work, we attempt long time-scale high-quality audio transfer\nand texture synthesis in the time-domain that captures harmonic,\nrhythmic, and timbral elements related to musical style, using examples that\nmay have different lengths and musical keys. We demonstrate the ability\nto use randomly initialized convolutional neural networks to transfer\nthese aspects of musical style from one piece onto another using 3\ndifferent representations of audio: the log-magnitude of the Short Time\nFourier Transform (STFT), the Mel spectrogram, and the Constant-Q Transform\nspectrogram. We propose using these representations as a way of\ngenerating and modifying perceptually significant characteristics of\nmusical audio content. We demonstrate each representation's\nshortcomings and advantages over others by carefully designing\nneural network structures that complement the nature of musical audio. 
Finally, we show that the most\ncompelling \u201cstyle\u201d transfer examples make use of an ensemble of these\nrepresentations to help capture the varying desired characteristics of\naudio signals.", "keywords": "Musical audio;neural style transfer;Time-Frequency;Spectrogram", "primary_area": "", "supplementary_material": "", "author": "Shaun Barry;Youngmoo Kim", "authorids": "smb484@drexel.edu;ykim@drexel.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbarry2018style,\ntitle={\u201cStyle\u201d Transfer for Musical Audio Using Multiple Time-Frequency Representations},\nauthor={Shaun Barry and Youngmoo Kim},\nyear={2018},\nurl={https://openreview.net/forum?id=BybQ7zWCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BybQ7zWCb", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11026592069341933196&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Byd-EfWCb", "title": "Decoding Decoders: Finding Optimal Representation Spaces for Unsupervised Similarity Tasks", "track": "main", "status": "Workshop", "tldr": "By introducing the notion of an optimal representation space, we provide a theoretical argument and experimental validation that an unsupervised model for sentences can perform well on both supervised similarity and unsupervised transfer tasks.", "abstract": "Experimental evidence indicates that simple models outperform complex deep networks on many unsupervised similarity tasks. Introducing the concept of an optimal representation space, we provide a simple theoretical resolution to this apparent paradox. In addition, we present a straightforward procedure that, without any retraining or architectural modifications, allows deep recurrent models to perform equally well (and sometimes better) when compared to shallow models. To validate our analysis, we conduct a set of consistent empirical evaluations and introduce several new sentence embedding models in the process. Even though this work is presented within the context of natural language processing, the insights are readily applicable to other domains that rely on distributed representations for transfer tasks.", "keywords": "distributed representations;sentence embedding;representation learning;unsupervised learning;encoder-decoder;RNN", "primary_area": "", "supplementary_material": "", "author": "Vitalii Zhelezniak;Dan Busbridge;April Shen;Samuel L. Smith;Nils Y. Hammerla", "authorids": "vitali.zhelezniak@babylonhealth.com;dan.busbridge@babylonhealth.com;april.shen@babylonhealth.com;slsmith@google.com;nils.hammerla@babylonhealth.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhelezniak2018decoding,\ntitle={Decoding Decoders: Finding Optimal Representation Spaces for Unsupervised Similarity Tasks},\nauthor={Vitalii Zhelezniak and Dan Busbridge and April Shen and Samuel L. Smith and Nils Y. 
Hammerla},\nyear={2018},\nurl={https://openreview.net/forum?id=Byd-EfWCb},\n}", "github": "[![github](/images/github_icon.svg) Babylonpartners/decoding-decoders](https://github.com/Babylonpartners/decoding-decoders)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Byd-EfWCb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17960191900263632298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Twin Networks: Matching the Future for Sequence Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/86", "id": "BydLzGb0Z", "author_site": "Dmitriy Serdyuk, Nan Rosemary Ke, Alessandro Sordoni, Adam Trischler, Christopher Pal, Yoshua Bengio", "tldr": "The paper introduces a method of training generative recurrent networks that helps to plan ahead. We run a second RNN in a reverse direction and make a soft constraint between cotemporal forward and backward states.", "abstract": "We propose a simple technique for encouraging generative RNNs to plan ahead. We train a ``backward'' recurrent network to generate a given sequence in reverse order, and we encourage states of the forward model to predict cotemporal states of the backward model. The backward network is used only during training, and plays no role during sampling or inference. We hypothesize that our approach eases modeling of long-term dependencies by implicitly forcing the forward states to hold information about the longer-term future (as contained in the backward states). 
We show empirically that our approach achieves 9% relative improvement for a speech recognition task, and achieves significant improvement on a COCO caption generation task.", "keywords": "generative rnns;long term dependencies;speech recognition;image captioning", "primary_area": "", "supplementary_material": "", "author": "Dmitriy Serdyuk;Nan Rosemary Ke;Alessandro Sordoni;Adam Trischler;Chris Pal;Yoshua Bengio", "authorids": "serdyuk.dmitriy@gmail.com;rosemary.nan.ke@gmail.com;alessandro.sordoni@gmail.com;adam.trischler@microsoft.com;chris.j.pal@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nserdyuk2018twin,\ntitle={Twin Networks: Matching the Future for Sequence Generation},\nauthor={Dmitriy Serdyuk and Nan Rosemary Ke and Alessandro Sordoni and Adam Trischler and Chris Pal and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BydLzGb0Z},\n}", "github": "[![github](/images/github_icon.svg) dmitriy-serdyuk/twin-net](https://github.com/dmitriy-serdyuk/twin-net) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BydLzGb0Z)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18040787837429694230&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BydLzGb0Z", "pdf": "https://openreview.net/pdf?id=BydLzGb0Z", "email": ";;;;;", "author_num": 6 }, { "title": "Towards Reverse-Engineering Black-Box Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/243", "id": "BydjJte0-", "author_site": "Seong Joon Oh, Max Augustin, Mario Fritz, Bernt Schiele", "tldr": "Querying a black-box neural network reveals a lot of information about it; we propose novel \"metamodels\" for effectively extracting information from a black box.", "abstract": "Many deployed learned models are black boxes: given input, returns output. Internal information about the model, such as the architecture, optimisation procedure, or training data, is not disclosed explicitly as it might contain proprietary information or make the system more vulnerable. This work shows that such attributes of neural networks can be exposed from a sequence of queries. This has multiple implications. On the one hand, our work exposes the vulnerability of black-box neural networks to different types of attacks -- we show that the revealed internal information helps generate more effective adversarial examples against the black box model. On the other hand, this technique can be used for better protection of private content from automatic recognition models using adversarial examples. 
Our paper suggests that it is actually hard to draw a line between white box and black box models.", "keywords": "black box;security;privacy;attack;metamodel;adversarial example;reverse-engineering;machine learning", "primary_area": "", "supplementary_material": "", "author": "Seong Joon Oh;Max Augustin;Mario Fritz;Bernt Schiele", "authorids": "joon@mpi-inf.mpg.de;maxaug@mpi-inf.mpg.de;mfritz@mpi-inf.mpg.de;schiele@mpi-inf.mpg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njoon2018towards,\ntitle={Towards Reverse-Engineering Black-Box Neural Networks},\nauthor={Seong Joon Oh and Max Augustin and Mario Fritz and Bernt Schiele},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BydjJte0-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BydjJte0-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 490, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13048183228314164595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17, "openreview": "https://openreview.net/forum?id=BydjJte0-", "pdf": "https://openreview.net/pdf?id=BydjJte0-", "email": ";;;", "author_num": 4 }, { "title": "Proximal Backpropagation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/202", "id": "ByeqORgAW", "author_site": "Thomas Frerix, Thomas M\u00f6llenhoff, Michael Moeller, Daniel Cremers", "tldr": "", "abstract": "We propose proximal backpropagation (ProxProp) as a novel algorithm that takes implicit instead of explicit gradient steps to update the network parameters during neural network training. Our algorithm is motivated by the step size limitation of explicit gradient descent, which poses an impediment for optimization. ProxProp is developed from a general point of view on the backpropagation algorithm, currently the most common technique to train neural networks via stochastic gradient descent and variants thereof. Specifically, we show that backpropagation of a prediction error is equivalent to sequential gradient descent steps on a quadratic penalty energy, which comprises the network activations as variables of the optimization. We further analyze theoretical properties of ProxProp and in particular prove that the algorithm yields a descent direction in parameter space and can therefore be combined with a wide variety of convergent algorithms. Finally, we devise an efficient numerical implementation that integrates well with popular deep learning frameworks. 
We conclude by demonstrating promising numerical results and show that ProxProp can be effectively combined with common first order optimizers such as Adam.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Frerix;Thomas M\u00f6llenhoff;Michael Moeller;Daniel Cremers", "authorids": "thomas.frerix@tum.de;thomas.moellenhoff@in.tum.de;michael.moeller@uni-siegen.de;cremers@tum.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nfrerix2018proximal,\ntitle={Proximal Backpropagation},\nauthor={Thomas Frerix and Thomas M\u00f6llenhoff and Michael Moeller and Daniel Cremers},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByeqORgAW},\n}", "github": "[![github](/images/github_icon.svg) tfrerix/proxprop](https://github.com/tfrerix/proxprop)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13919472914722495778&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByeqORgAW", "pdf": "https://openreview.net/pdf?id=ByeqORgAW", "email": ";;;", "author_num": 4 }, { "id": "BygpQlbA-", "title": "Towards Provable Control for Unknown Linear Dynamical Systems", "track": "main", "status": "Workshop", "tldr": "Using a novel representation of symmetric linear dynamical systems with a latent state, we formulate optimal control as a convex program, giving the first polynomial-time algorithm that solves optimal control with sample complexity only polylogarithmic in the time horizon.", "abstract": "We study the control of symmetric linear dynamical systems with unknown dynamics and a hidden state. Using a recent spectral filtering technique for concisely representing such systems in a linear basis, we formulate optimal control in this setting as a convex program. This approach eliminates the need to solve the non-convex problem of explicit identification of the system and its latent state, and allows for provable optimality guarantees for the control signal. 
We give the first efficient algorithm for finding the optimal control signal with an arbitrary time horizon T, with sample complexity (number of training rollouts) polynomial only in log(T) and other relevant parameters.", "keywords": "optimal control;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Elad Hazan;Holden Lee;Karan Singh;Cyril Zhang;Yi Zhang", "authorids": "arora@cs.princeton.edu;ehazan@cs.princeton.edu;holdenl@princeton.edu;karans@cs.princeton.edu;cyril.zhang@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\narora2018towards,\ntitle={Towards Provable Control for Unknown Linear Dynamical Systems},\nauthor={Sanjeev Arora and Elad Hazan and Holden Lee and Karan Singh and Cyril Zhang and Yi Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=BygpQlbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BygpQlbA-", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9814362943393714119&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Byht0GbRZ", "title": "STRUCTURED ALIGNMENT NETWORKS", "track": "main", "status": "Reject", "tldr": "Matching sentences by learning the latent constituency tree structures with a variant of the inside-outside algorithm embedded as a neural network layer.", "abstract": " Many tasks in natural language processing involve comparing two sentences to compute some notion of relevance, entailment, or similarity. Typically this comparison is done either at the word level or at the sentence level, with no attempt to leverage the inherent structure of the sentence. When sentence structure is used for comparison, it is obtained during a non-differentiable pre-processing step, leading to propagation of errors. We introduce a model of structured alignments between sentences, showing how to compare two sentences by matching their latent structures. Using a structured attention mechanism, our model matches possible spans in the first sentence to possible spans in the second sentence, simultaneously discovering the tree structure of each sentence and performing a comparison, in a model that is fully differentiable and is trained only on the comparison objective. We evaluate this model on two sentence comparison tasks: the Stanford natural language inference dataset and the TREC-QA dataset. 
We find that comparing spans results in superior performance to comparing words individually, and that the learned trees are consistent with actual linguistic structures.", "keywords": "structured attention;sentence matching", "primary_area": "", "supplementary_material": "", "author": "Yang Liu;Matt Gardner", "authorids": "yang.liu2@ed.ac.uk;mattg@allenai.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2018structured,\ntitle={{STRUCTURED} {ALIGNMENT} {NETWORKS}},\nauthor={Yang Liu and Matt Gardner},\nyear={2018},\nurl={https://openreview.net/forum?id=Byht0GbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Byht0GbRZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3232069488113210396&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByhthReRb", "title": "A Neural Method for Goal-Oriented Dialog Systems to interact with Named Entities", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many goal-oriented dialog tasks, especially ones in which the dialog system has to interact with external knowledge sources such as databases, have to handle a large number of Named Entities (NEs). There are at least two challenges in handling NEs using neural methods in such settings: individual NEs may occur only rarely making it hard to learn good representations of them, and many of the Out Of Vocabulary words that occur during test time may be NEs. Thus, the need to interact well with these NEs has emerged as a serious challenge to building neural methods for goal-oriented dialog tasks. 
In this paper, we propose a new neural method for this problem, and present empirical evaluations on a structured Question answering task and three related goal-oriented dialog tasks that show that our proposed method can be effective in interacting with NEs in these settings.", "keywords": "Named Entities;Neural methods;Goal oriented dialog", "primary_area": "", "supplementary_material": "", "author": "Janarthanan Rajendran;Jatin Ganhotra;Xiaoxiao Guo;Mo Yu;Satinder Singh", "authorids": "rjana@umich.edu;jatinganhotra@us.ibm.com;xiaoxiao.guo@ibm.com;yum@us.ibm.com;baveja@umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nrajendran2018a,\ntitle={A Neural Method for Goal-Oriented Dialog Systems to interact with Named Entities},\nauthor={Janarthanan Rajendran and Jatin Ganhotra and Xiaoxiao Guo and Mo Yu and Satinder Singh},\nyear={2018},\nurl={https://openreview.net/forum?id=ByhthReRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByhthReRb", "pdf_size": 0, "rating": "3;4;6", "confidence": "3;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wCAHnv2bL1QJ:scholar.google.com/&scioq=A+Neural+Method+for+Goal-Oriented+Dialog+Systems+to+interact+with+Named+Entities&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Byj54-bAW", "title": "A Tensor Analysis on Dense Connectivity via Convolutional Arithmetic Circuits", "track": "main", "status": "Reject", "tldr": "We analyze the expressive power of the connections used in DenseNets via tensor decompositions.", "abstract": "Several state of the art convolutional networks rely on inter-connecting different layers to ease the flow of information and gradient between their input and output layers. These techniques have enabled practitioners to successfully train deep convolutional networks with hundreds of layers. Particularly, a novel way of interconnecting layers was introduced as the Dense Convolutional Network (DenseNet) and has achieved state of the art performance on relevant image recognition tasks. Despite their notable empirical success, their theoretical understanding is still limited. In this work, we address this problem by analyzing the effect of layer interconnection on the overall expressive power of a convolutional network. In particular, the connections used in DenseNet are compared with other types of inter-layer connectivity. We carry out a tensor analysis on the expressive power inter-connections on convolutional arithmetic circuits (ConvACs) and relate our results to standard convolutional networks. The analysis leads to performance bounds and practical guidelines for design of ConvACs. 
The generalization of these results are discussed for other kinds of convolutional networks via generalized tensor decompositions.", "keywords": "DenseNets;Tensor Analysis;Convolutional Arithmetic Circuits", "primary_area": "", "supplementary_material": "", "author": "Emilio Rafael Balda;Arash Behboodi;Rudolf Mathar", "authorids": "emilio.balda@ti.rwth-aachen.de;arash.behboodi@ti.rwth-aachen.de;mathar@ti.rwth-aachen.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrafael2018a,\ntitle={A Tensor Analysis on Dense Connectivity via Convolutional Arithmetic Circuits},\nauthor={Emilio Rafael Balda and Arash Behboodi and Rudolf Mathar},\nyear={2018},\nurl={https://openreview.net/forum?id=Byj54-bAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Byj54-bAW", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2996653860535395304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Byk4My-RZ", "title": "Flexible Prior Distributions for Deep Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the problem of training generative models with deep neural networks as generators, i.e. to map latent codes to data points. Whereas the dominant paradigm combines simple priors over codes with complex deterministic models,\nwe argue that it might be advantageous to use more flexible code distributions. We demonstrate how these distributions can be induced directly from the data. 
The benefits include: more powerful generative models, better modeling of latent\nstructure and explicit control of the degree of generalization.", "keywords": "Deep Generative Models;GANs", "primary_area": "", "supplementary_material": "", "author": "Yannic Kilcher;Aurelien Lucchi;Thomas Hofmann", "authorids": "yannic.kilcher@inf.ethz.ch;aurelien.lucchi@inf.ethz.ch;thomas.hofmann@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkilcher2018flexible,\ntitle={Flexible Prior Distributions for Deep Generative Models},\nauthor={Yannic Kilcher and Aurelien Lucchi and Thomas Hofmann},\nyear={2018},\nurl={https://openreview.net/forum?id=Byk4My-RZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Byk4My-RZ", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3366669129844001798&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BykJlIAbM", "title": "A cluster-to-cluster framework for neural machine translation", "track": "main", "status": "Withdraw", "tldr": "We invent a novel cluster-to-cluster framework for NMT training, which can better understand the both source and target language diversity.", "abstract": "The quality of a machine translation system depends largely on the availability of sizable parallel corpora. For the recently popular Neural Machine Translation (NMT) framework, data sparsity problem can become even more severe. With large amount of tunable parameters, the NMT model may overfit to the existing language pairs while failing to understand the general diversity in language. In this paper, we advocate to broadcast every sentence pair as two groups of similar sentences to incorporate more diversity in language expressions, which we name as parallel cluster. Then we define a more general cluster-to-cluster correspondence score and train our model to maximize this score. Since direct maximization is difficult, we derive its lower-bound as our surrogate objective, which is found to generalize point-point Maximum Likelihood Estimation (MLE) and point-to-cluster Reward Augmented Maximum Likelihood (RAML) algorithms as special cases. Based on this novel objective function, we delineate four potential systems to realize our cluster-to-cluster framework and test their performances in three recognized translation tasks, each task with forward and reverse translation directions. In each of the six experiments, our proposed four parallel systems have consistently proved to outperform the MLE baseline, RL (Reinforcement Learning) and RAML systems significantly. Finally, we have performed case study to empirically analyze the strength of the cluster-to-cluster NMT framework. 
", "keywords": "Natural Language Processing;Machine Translation;Deep Learning;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper150/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018a,\n title={A cluster-to-cluster framework for neural machine translation},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=ByWBpcJAZ}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BykJlIAbM", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;2;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": -0.6546536707079772, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Bym0cU1CZ", "title": "Towards Interpretable Chit-chat: Open Domain Dialogue Generation with Dialogue Acts", "track": "main", "status": "Reject", "tldr": "open domain dialogue generation with dialogue acts", "abstract": "Conventional methods model open domain dialogue generation as a black box through end-to-end learning from large scale conversation data. In this work, we make the first step to open the black box by introducing dialogue acts into open domain dialogue generation. The dialogue acts are generally designed and reveal how people engage in social chat. Inspired by analysis on real data, we propose jointly modeling dialogue act selection and response generation, and perform learning with human-human conversations tagged with a dialogue act classifier and a reinforcement approach to further optimizing the model for long-term conversation. 
With the dialogue acts, we not only achieve significant improvement over state-of-the-art methods on response quality for given contexts and long-term conversation in both machine-machine simulation and human-machine conversation, but also are capable of explaining why such achievements can be made.", "keywords": "dialogue generation;dialogue acts;open domain conversation;supervised learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Wei Wu;Can Xu;Yu Wu;Zhoujun Li", "authorids": "wuwei@microsoft.com;can.xu@microsoft.com;wumark@126.com;lizj@buaa.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2018towards,\ntitle={Towards Interpretable Chit-chat: Open Domain Dialogue Generation with Dialogue Acts},\nauthor={Wei Wu and Can Xu and Yu Wu and Zhoujun Li},\nyear={2018},\nurl={https://openreview.net/forum?id=Bym0cU1CZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=Bym0cU1CZ", "pdf_size": 0, "rating": "4;7;7", "confidence": "5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3654077639939829204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Byni8NLHf", "title": "Learning Topics using Semantic Locality", "track": "main", "status": "Withdraw", "tldr": "We proposed a universal method which can be used in the data preprocessing stage to generate the more meaningful topic that better represents the given document", "abstract": "The topic modeling discovers the latent topic probability of given the text documents. To generate the more meaningful topic that better represents the given document, we proposed a universal method which can be used in the data preprocessing stage. The method consists of three steps. First, it generates the word/word-pair from every single document. Second, it applies a two way parallel TF-IDF algorithm to word/word-pair for semantic filtering. Third, it uses the k-means algorithm to merge the word pairs that have the similar semantic meaning.\n\nExperiments are carried out on the Open Movie Database (OMDb), Reuters Dataset and 20NewsGroup Dataset and use the mean Average Precision score as the evaluation metric. Comparing our results with other state-of-the-art topic models, such as Latent Dirichlet allocation and traditional Restricted Boltzmann Machines. Our proposed data preprocessing can improve the generated topic accuracy by up to 12.99\\%. 
How the number of clusters and the number of word pairs should be adjusted for different type of text document is also discussed.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ziyi Zhao;Krittaphat Pugdeethosapol;Sheng Lin;Zhe Li;Yanzhi Wang;Qinru Qiu", "authorids": "zzhao37@syr.edu;kpugdeet@syr.edu;shlin@syr.edu;zli89@syr.edu;ywang393@syr.edu;qiqiu@syr.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Byni8NLHf", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1650766915723497146&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "BypdvewVM", "title": "Assessing the scalability of biologically-motivated deep learning algorithms and architectures", "track": "main", "status": "Withdraw", "tldr": "Benchmarks for biologically plausible learning algorithms on complex datasets and architectures", "abstract": "The backpropagation of error algorithm (BP) is often said to be impossible to implement in a real brain. The recent success of deep networks in machine learning and AI, however, has inspired a number of proposals for understanding how the brain might learn across multiple layers, and hence how it might implement or approximate BP. As of yet, none of these proposals have been rigorously evaluated on tasks where BP-guided deep learning has proved critical, or in architectures more structured than simple fully-connected networks. Here we present the first results on scaling up a biologically motivated model of deep learning to datasets which need deep networks with appropriate architectures to achieve good performance. We present results on CIFAR-10 and ImageNet. For CIFAR-10 we show that our algorithm, a straightforward, weight-transport-free variant of difference target-propagation (DTP) modified to remove backpropagation from the penultimate layer, is competitive with BP in training deep networks with locally defined receptive fields that have untied weights. For ImageNet we find that both DTP and our algorithm perform significantly worse than BP, opening questions about whether different architectures or algorithms are required to scale these approaches. Our results and implementation details help establish baselines for biologically motivated deep learning schemes going forward.", "keywords": "target propagation;biologically-plausible learning;benchmark;neuroscience", "primary_area": "", "supplementary_material": "", "author": "Sergey Bartunov;Adam Santoro;Blake A. Richards;Geoffrey E. 
Hinton;Timothy Lillicrap", "authorids": "bartunov@google.com;adamsantoro@google.com;blake.richards@utoronto.ca;geoffhinton@google.com;countzero@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BypdvewVM", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 329, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17862085183234087999&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "ByqFhGZCW", "title": "MACHINE VS MACHINE: MINIMAX-OPTIMAL DEFENSE AGAINST ADVERSARIAL EXAMPLES", "track": "main", "status": "Reject", "tldr": "A game-theoretic solution to adversarial attacks and defenses.", "abstract": "Recently, researchers have discovered that the state-of-the-art object classifiers can be fooled easily by small perturbations in the input unnoticeable to human eyes. It is known that an attacker can generate strong adversarial examples if she knows the classifier parameters. Conversely, a defender can robustify the classifier by retraining if she has the adversarial examples. \nThe cat-and-mouse game nature of attacks and defenses raises the question of the presence of equilibria in the dynamics.\nIn this paper, we present a neural-network based attack class to approximate a larger but intractable class of attacks, and \nformulate the attacker-defender interaction as a zero-sum leader-follower game. We present sensitivity-penalized optimization algorithms to find minimax solutions, which are the best worst-case defenses against whitebox attacks. 
Advantages of the learning-based attacks and defenses compared to gradient-based attacks and defenses are demonstrated with MNIST and CIFAR-10.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jihun Hamm", "authorids": "hammj@cse.ohio-state.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nhamm2018machine,\ntitle={{MACHINE} {VS} {MACHINE}: {MINIMAX}-{OPTIMAL} {DEFENSE} {AGAINST} {ADVERSARIAL} {EXAMPLES}},\nauthor={Jihun Hamm},\nyear={2018},\nurl={https://openreview.net/forum?id=ByqFhGZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByqFhGZCW", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11558094815393919511&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ByquB-WC-", "title": "Finding ReMO (Related Memory Object): A Simple neural architecture for Text based Reasoning", "track": "main", "status": "Reject", "tldr": "A simple reasoning architecture based on the memory network (MemNN) and relation network (RN), reducing the time complexity compared to the RN and achieving state-of-the-are result on bAbI story based QA and bAbI dialog.", "abstract": "Memory Network based models have shown a remarkable progress on the task of relational reasoning.\nRecently, a simpler yet powerful neural network module called Relation Network (RN) has been introduced. \nDespite its architectural simplicity, the time complexity of relation network grows quadratically with data, hence limiting its application to tasks with a large-scaled memory.\nWe introduce Related Memory Network, an end-to-end neural network architecture exploiting both memory network and relation network structures. \nWe follow memory network's four components while each component operates similar to the relation network without taking a pair of objects. \nAs a result, our model is as simple as RN but the computational complexity is reduced to linear time.\nIt achieves the state-of-the-art results in jointly trained bAbI-10k story-based question answering and bAbI dialog dataset. 
", "keywords": "Natural Language Processing;Deep Learning;Reasoning", "primary_area": "", "supplementary_material": "", "author": "Jihyung Moon;Hyochang Yang;Sungzoon Cho", "authorids": "jhmoon@dm.snu.ac.kr;hyochang@dm.snu.ac.kr;zoon@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmoon2018finding,\ntitle={Finding Re{MO} (Related Memory Object): A Simple neural architecture for Text based Reasoning},\nauthor={Jihyung Moon and Hyochang Yang and Sungzoon Cho},\nyear={2018},\nurl={https://openreview.net/forum?id=ByquB-WC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByquB-WC-", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1177851689001766366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Robustness of Classifiers to Universal Perturbations: A Geometric Perspective", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/286", "id": "ByrZyglCb", "author_site": "Seyed Mohsen Moosavi Dezfooli, Alhussein Fawzi, Omar Fawzi, Pascal Frossard, Stefano Soatto", "tldr": "Analysis of vulnerability of classifiers to universal perturbations and relation to the curvature of the decision boundary.", "abstract": "Deep networks have recently been shown to be vulnerable to universal perturbations: there exist very small image-agnostic perturbations that cause most natural images to be misclassified by such classifiers. In this paper, we provide a quantitative analysis of the robustness of classifiers to universal perturbations, and draw a formal link between the robustness to universal perturbations, and the geometry of the decision boundary. Specifically, we establish theoretical bounds on the robustness of classifiers under two decision boundary models (flat and curved models). We show in particular that the robustness of deep networks to universal perturbations is driven by a key property of their curvature: there exist shared directions along which the decision boundary of deep networks is systematically positively curved. Under such conditions, we prove the existence of small universal perturbations. 
Our analysis further provides a novel geometric method for computing universal perturbations, in addition to explaining their properties.", "keywords": "Universal perturbations;robustness;curvature", "primary_area": "", "supplementary_material": "", "author": "Seyed-Mohsen Moosavi-Dezfooli;Alhussein Fawzi;Omar Fawzi;Pascal Frossard;Stefano Soatto", "authorids": "seyed.moosavi@epfl.ch;fawzi@cs.ucla.edu;omar.fawzi@ens-lyon.fr;pascal.frossard@epfl.ch;soatto@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmoosavi-dezfooli2018robustness,\ntitle={Robustness of Classifiers to Universal Perturbations: A Geometric Perspective},\nauthor={Seyed-Mohsen Moosavi-Dezfooli and Alhussein Fawzi and Omar Fawzi and Pascal Frossard and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ByrZyglCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11866393418232112272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ByrZyglCb", "pdf": "https://openreview.net/pdf?id=ByrZyglCb", "email": ";;;;", "author_num": 5 }, { "title": "Certified Defenses against Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/116", "id": "Bys4ob-Rb", "author_site": "Aditi Raghunathan, Jacob Steinhardt, Percy Liang", "tldr": "We demonstrate a certifiable, trainable, and scalable method for defending against adversarial examples.", "abstract": "While neural networks have achieved high accuracy on standard image classification benchmarks, their accuracy drops to nearly zero in the presence of small adversarial perturbations to test inputs. Defenses based on regularization and adversarial training have been proposed, but often followed by new, stronger attacks that defeat these defenses. Can we somehow end this arms race? In this work, we study this problem for neural networks with one hidden layer. We first propose a method based on a semidefinite relaxation that outputs a certificate that for a given network and test input, no attack can force the error to exceed a certain value. Second, as this certificate is differentiable, we jointly optimize it with the network parameters, providing an adaptive regularizer that encourages robustness against all attacks. 
On MNIST, our approach produces a network and a certificate that no attack that perturbs each pixel by at most $\\epsilon = 0.1$ can cause more than $35\\%$ test error.\n", "keywords": "adversarial examples;certificate of robustness;convex relaxations", "primary_area": "", "supplementary_material": "", "author": "Aditi Raghunathan;Jacob Steinhardt;Percy Liang", "authorids": "aditir@stanford.edu;jsteinhardt@cs.stanford.edu;pliang@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nraghunathan2018certified,\ntitle={Certified Defenses against Adversarial Examples },\nauthor={Aditi Raghunathan and Jacob Steinhardt and Percy Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Bys4ob-Rb},\n}", "github": "[worksheets/0xa21e7940](https://worksheets.codalab.org/worksheets/0xa21e794020bb474d8804ec7bc0543f52) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Bys4ob-Rb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;8;8", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 1.0, "gs_citation": 1176, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17145877608540180848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Bys4ob-Rb", "pdf": "https://openreview.net/pdf?id=Bys4ob-Rb", "email": ";;", "author_num": 3 }, { "id": "Bys_NzbC-", "title": "Achieving Strong Regularization for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We investigate how and why strong L1/L2 regularization fails and propose a method that can achieve strong regularization.", "abstract": "L1 and L2 regularizers are critical tools in machine learning due to their ability to simplify solutions. However, imposing strong L1 or L2 regularization with the gradient descent method easily fails, and this limits the generalization ability of the underlying neural networks. To understand this phenomenon, we investigate how and why training fails for strong regularization. Specifically, we examine how gradients change over time for different regularization strengths and provide an analysis of why the gradients diminish so fast. We find that there exists a tolerance level of regularization strength, where the learning completely fails if the regularization strength goes beyond it. We propose a simple but novel method, Delayed Strong Regularization, in order to moderate the tolerance level. Experimental results show that our proposed approach indeed achieves strong regularization for both L1 and L2 regularizers and improves both accuracy and sparsity on public data sets. 
Our source code is published.", "keywords": "deep learning;regularization", "primary_area": "", "supplementary_material": "", "author": "Dae Hoon Park;Chiu Man Ho;Yi Chang", "authorids": "pdhvip@gmail.com;chiuman100@gmail.com;yi.chang@huawei.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhoon2018achieving,\ntitle={Achieving Strong Regularization for Deep Neural Networks},\nauthor={Dae Hoon Park and Chiu Man Ho and Yi Chang},\nyear={2018},\nurl={https://openreview.net/forum?id=Bys_NzbC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bys_NzbC-", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;2", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16676704753086970654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BysrYlP0-", "title": "Placeholder", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Placeholder ", "keywords": "Dialog Systems;Language Generation", "primary_area": "", "supplementary_material": "", "author": "Chongyang Tao;Shen Gao;Mingyue Shang;Rui Yan;Dongyan Zhao", "authorids": "chongyangtao@163.com;63388@qq.com;shangmy@pku.edu.cn;ruiyan@pku.edu.cn;zhaody@pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=BysrYlP0-", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 5, "corr_rating_confidence": 0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10061603456447875443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Learning Latent Permutations with Gumbel-Sinkhorn Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/183", "id": "Byt3oJ-0W", "author_site": "gonzalo mena, David Belanger, Scott Linderman, Jasper Snoek", "tldr": "A new method for gradient-descent inference of permutations, with applications to latent matching inference and supervised learning of permutations with neural networks", "abstract": "Permutations and matchings are core building blocks in a variety of latent variable models, as they allow us to align, canonicalize, and sort data. Learning in such models is difficult, however, because exact marginalization over these combinatorial objects is intractable. In response, this paper introduces a collection of new methods for end-to-end learning in such models that approximate discrete maximum-weight matching using the continuous Sinkhorn operator. Sinkhorn iteration is attractive because it functions as a simple, easy-to-implement analog of the softmax operator. With this, we can define the Gumbel-Sinkhorn method, an extension of the Gumbel-Softmax method (Jang et al. 2016, Maddison2016 et al. 2016) to distributions over latent matchings. 
We demonstrate the effectiveness of our method by outperforming competitive baselines on a range of qualitatively different tasks: sorting numbers, solving jigsaw puzzles, and identifying neural signals in worms. ", "keywords": "Permutation;Latent;Sinkhorn;Inference;Optimal Transport;Gumbel;Softmax;Sorting", "primary_area": "", "supplementary_material": "", "author": "Gonzalo Mena;David Belanger;Scott Linderman;Jasper Snoek", "authorids": "gem2131@columbia.edu;dbelanger@google.com;scott.linderman@gmail.com;jsnoek@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmena2018learning,\ntitle={Learning Latent Permutations with Gumbel-Sinkhorn Networks},\nauthor={Gonzalo Mena and David Belanger and Scott Linderman and Jasper Snoek},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Byt3oJ-0W},\n}", "github": "[![github](/images/github_icon.svg) google/gumbel_sinkhorn](https://github.com/google/gumbel_sinkhorn) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Byt3oJ-0W)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;4;4", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 307, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17995429437153045101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Byt3oJ-0W", "pdf": "https://openreview.net/pdf?id=Byt3oJ-0W", "email": ";;;", "author_num": 4 }, { "id": "ByuI-mW0W", "title": "Towards a Testable Notion of Generalization for Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "Assess whether or not your GAN is actually doing something other than memorizing the training data.", "abstract": "We consider the question of how to assess generative adversarial networks, in particular with respect to whether or not they generalise beyond memorising the training data. We propose a simple procedure for assessing generative adversarial network performance based on a principled consideration of what the actual goal of generalisation is. Our approach involves using a test set to estimate the Wasserstein distance between the generative distribution produced by our procedure, and the underlying data distribution. We use this procedure to assess the performance of several modern generative adversarial network architectures. We find that this procedure is sensitive to the choice of ground metric on the underlying data space, and suggest a choice of ground metric that substantially improves performance. 
We finally suggest that attending to the ground metric used in Wasserstein generative adversarial network training may be fruitful, and outline a concrete pathway towards doing so.", "keywords": "generative adversarial networks;Wasserstein;GAN;generalization;theory", "primary_area": "", "supplementary_material": "", "author": "Robert Cornish;Hongseok Yang;Frank Wood", "authorids": "rcornish@robots.ox.ac.uk;hongseok.yang@cs.ox.ac.uk;fwood@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncornish2018towards,\ntitle={Towards a Testable Notion of Generalization for Generative Adversarial Networks},\nauthor={Robert Cornish and Hongseok Yang and Frank Wood},\nyear={2018},\nurl={https://openreview.net/forum?id=ByuI-mW0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByuI-mW0W", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10370072916053814092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "ByuP8yZRb", "title": "Censoring Representations with Multiple-Adversaries over Random Subspaces", "track": "main", "status": "Reject", "tldr": "This paper improves the quality of the recently proposed adversarial feature learning (AFL) approach for incorporating explicit constraints on representations, by introducing the concept of the {\\em vulnerableness} of the adversary. ", "abstract": "Adversarial feature learning (AFL) is one of the promising ways to explicitly constrain neural networks to learn desired representations; for example, AFL could help to learn anonymized representations so as to avoid privacy issues. AFL learns such representations by training the networks to deceive an adversary that predicts the sensitive information from the network, and therefore, the success of AFL heavily relies on the choice of the adversary. This paper proposes a novel design of the adversary, {\\em multiple adversaries over random subspaces} (MARS), which instantiates the concept of {\\em vulnerableness}. The proposed method is motivated by the assumption that deceiving an adversary could fail to give meaningful information if the adversary is easily fooled, and adversaries relying on a single classifier suffer from this issue. \nIn contrast, the proposed method is designed to be less vulnerable by utilizing an ensemble of independent classifiers, where each classifier tries to predict the sensitive variables from a different {\\em subset} of the representations. \nThe empirical validations on three user-anonymization tasks show that our proposed method achieves state-of-the-art performance on all three datasets without significantly harming the utility of data. \nThis is significant because it offers new insights into designing the adversary, which is important for improving the performance of AFL. 
", "keywords": "Adversarial Training;Privacy Protection;Random Subspace", "primary_area": "", "supplementary_material": "", "author": "Yusuke Iwasawa;Kotaro Nakayama;Yutaka Matsuo", "authorids": "iwasawa@weblab.t.u-tokyo.ac.jp;nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\niwasawa2018censoring,\ntitle={Censoring Representations with Multiple-Adversaries over Random Subspaces},\nauthor={Yusuke Iwasawa and Kotaro Nakayama and Yutaka Matsuo},\nyear={2018},\nurl={https://openreview.net/forum?id=ByuP8yZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByuP8yZRb", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EGXmodBzIjIJ:scholar.google.com/&scioq=Censoring+Representations+with+Multiple-Adversaries+over+Random+Subspaces&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "title": "Minimax Curriculum Learning: Machine Teaching with Desirable Difficulties and Scheduled Diversity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/276", "id": "BywyFQlAW", "author_site": "Tianyi Zhou, Jeff Bilmes", "tldr": "Minimax Curriculum Learning is a machine teaching method involving increasing desirable hardness and scheduled reducing diversity.", "abstract": "We introduce and study minimax curriculum learning (MCL), a new method for adaptively selecting a sequence of training subsets for a succession of stages in machine learning. The subsets are encouraged to be small and diverse early on, and then larger, harder, and allowably more homogeneous in later stages. At each stage, model weights and training sets are chosen by solving a joint continuous-discrete minimax optimization, whose objective is composed of a continuous loss (reflecting training set hardness) and a discrete submodular promoter of diversity for the chosen subset. MCL repeatedly solves a sequence of such optimizations with a schedule of increasing training set size and decreasing pressure on diversity encouragement. We reduce MCL to the minimization of a surrogate function handled by submodular maximization and continuous gradient methods. We show that MCL achieves better performance and, with a clustering trick, uses fewer labeled samples for both shallow and deep models while achieving the same performance. Our method involves repeatedly solving constrained submodular maximization of an only slowly varying function on the same ground set. 
Therefore, we develop a heuristic method that utilizes the previous submodular maximization solution as a warm start for the current submodular maximization process to reduce computation while still yielding a guarantee.", "keywords": "machine teaching;deep learning;minimax;curriculum learning;submodular;diversity", "primary_area": "", "supplementary_material": "", "author": "Tianyi Zhou;Jeff Bilmes", "authorids": "tianyi.david.zhou@gmail.com;bilmes@uw.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhou2018scheduled,\ntitle={Minimax Curriculum Learning: Machine Teaching with Desirable Difficulties and Scheduled Diversity},\nauthor={Tianyi Zhou and Jeff Bilmes},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=BywyFQlAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15130630439374027502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "openreview": "https://openreview.net/forum?id=BywyFQlAW", "pdf": "https://openreview.net/pdf?id=BywyFQlAW", "email": ";", "author_num": 2 }, { "id": "ByxLBMZCb", "title": "Learning Deep Models: Critical Points and Local Openness", "track": "main", "status": "Workshop", "tldr": "", "abstract": "With the increasing interest in deeper understanding of the loss surface of many non-convex deep models, this paper presents a unifying framework to study the local/global optima equivalence of the optimization problems arising from training of such non-convex models. Using the \"local openness\" property of the underlying training models, we provide simple sufficient conditions under which any local optimum of the resulting optimization problem is globally optimal. We first completely characterize the local openness of matrix multiplication mapping in its range. Then we use our characterization to: 1) show that every local optimum of two layer linear networks is globally optimal. Unlike many existing results in the literature, our result requires no assumption on the target data matrix Y, and input data matrix X. 2) develop almost complete characterization of the local/global optima equivalence of multi-layer linear neural networks. We provide various counterexamples to show the necessity of each of our assumptions. 3) show global/local optima equivalence of non-linear deep models having certain pyramidal structure. Unlike some existing works, our result requires no assumption on the differentiability of the activation functions and can go beyond \"full-rank\" cases. 
\n", "keywords": "Training Deep Models;Non-convex Optimization;Local and Global Equivalence;Local Openness", "primary_area": "", "supplementary_material": "", "author": "Maher Nouiehed;Meisam Razaviyayn", "authorids": "nouiehed@usc.edu;razaviya@usc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnouiehed2018learning,\ntitle={Learning Deep Models: Critical Points and Local Openness},\nauthor={Maher Nouiehed and Meisam Razaviyayn},\nyear={2018},\nurl={https://openreview.net/forum?id=ByxLBMZCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByxLBMZCb", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=518693101108020479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ByzvHagA-", "title": "Disentangled activations in deep networks", "track": "main", "status": "Reject", "tldr": "We propose a novel regularization method that penalize covariance between dimensions of the hidden layers in a network.", "abstract": "Deep neural networks have been tremendously successful in a number of tasks.\nOne of the main reasons for this is their capability to automatically\nlearn representations of data in levels of abstraction,\nincreasingly disentangling the data as the internal transformations are applied.\nIn this paper we propose a novel regularization method that penalize covariance between dimensions of the hidden layers in a network, something that benefits the disentanglement.\nThis makes the network learn nonlinear representations that are linearly uncorrelated, yet allows the model to obtain good results on a number of tasks, as demonstrated by our experimental evaluation.\nThe proposed technique can be used to find the dimensionality of the underlying data, because it effectively disables dimensions that aren't needed.\nOur approach is simple and computationally cheap, as it can be applied as a regularizer to any gradient-based learning model.", "keywords": "representation learning;disentanglement;regularization", "primary_area": "", "supplementary_material": "", "author": "Mikael K\u00e5geb\u00e4ck;Olof Mogren", "authorids": "kageback@chalmers.se;olof@mogren.one", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nk\u00e5geb\u00e4ck2018disentangled,\ntitle={Disentangled activations in deep networks},\nauthor={Mikael K\u00e5geb\u00e4ck and Olof Mogren},\nyear={2018},\nurl={https://openreview.net/forum?id=ByzvHagA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByzvHagA-", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9993698088478904912&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1-IBSgMz", "title": "A Matrix Approximation View of NCE that Justifies Self-Normalization", "track": "main", "status": 
"Withdraw", "tldr": "We prove that NCE is self-normalized and demonstrate it on datasets", "abstract": "Self-normalizing discriminative models approximate the normalized probability of a class without having to compute the partition function. This property is useful to computationally-intensive neural network classifiers, as the cost of computing the partition function grows linearly with the number of classes and may become prohibitive. In particular, since neural language models may deal with up to millions of classes, their self-normalization properties received notable attention. Several\nrecent studies empirically found that language models, trained using Noise Contrastive Estimation (NCE), exhibit self-normalization, but could not explain why. In this study, we provide a theoretical justification to this property by viewing\nNCE as a low-rank matrix approximation. Our empirical investigation compares NCE to the alternative explicit approach for self-normalizing language models. It also uncovers a surprising negative correlation between self-normalization and\nperplexity, as well as some regularity in the observed errors that may potentially be used for improving self-normalization algorithms in the future.", "keywords": "language modeling;NCE;self-normalization", "primary_area": "", "supplementary_material": "", "author": "Jacob Goldberger;Oren Melamud", "authorids": "jacob.goldberger@biu.ac.il;oren@melamuds.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1-IBSgMz", "pdf_size": 0, "rating": "2;3;6", "confidence": "4;5;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 2, "corr_rating_confidence": -0.720576692122892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AvWUpxJpnhgJ:scholar.google.com/&scioq=A+Matrix+Approximation+View+of+NCE+that+Justifies+Self-Normalization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Gaussian Process Behaviour in Wide Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/161", "id": "H1-nGgWC-", "author_site": "Alexander Matthews, Jiri Hron, Mark Rowland, Richard E Turner, Zoubin Ghahramani", "tldr": "", "abstract": "Whilst deep neural networks have shown great empirical success, there is still much work to be done to understand their theoretical properties. In this paper, we study the relationship between Gaussian processes with a recursive kernel definition and random wide fully connected feedforward networks with more than one hidden layer. We exhibit limiting procedures under which finite deep networks will converge in distribution to the corresponding Gaussian process. To evaluate convergence rates empirically, we use maximum mean discrepancy. We then exhibit situations where existing Bayesian deep networks are close to Gaussian processes in terms of the key quantities of interest. Any Gaussian process has a flat representation. Since this behaviour may be undesirable in certain situations we discuss ways in which it might be prevented.", "keywords": "Gaussian Processes;Bayesian Deep Learning;Theory of Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Alexander G. de G. Matthews;Jiri Hron;Mark Rowland;Richard E. 
Turner;Zoubin Ghahramani", "authorids": "am554@cam.ac.uk;jh2084@cam.ac.uk;mr504@cam.ac.uk;ret26@cam.ac.uk;zoubin@eng.cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ng.2018gaussian,\ntitle={Gaussian Process Behaviour in Wide Deep Neural Networks},\nauthor={Alexander G. de G. Matthews and Jiri Hron and Mark Rowland and Richard E. Turner and Zoubin Ghahramani},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1-nGgWC-},\n}", "github": "[![github](/images/github_icon.svg) widedeepnetworks/widedeepnetworks](https://github.com/widedeepnetworks/widedeepnetworks) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=H1-nGgWC-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 483, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14179398766282481068&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1-nGgWC-", "pdf": "https://openreview.net/pdf?id=H1-nGgWC-", "email": ";;;;", "author_num": 5 }, { "id": "H1-oTz-Cb", "title": "Parametrizing filters of a CNN with a GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "It is commonly agreed that the use of relevant invariances as a good statistical bias is important in machine-learning. However, most approaches that explicitely incorporate invariances into a model architecture only make use of very simple transformations, such as translations and rotations. Hence, there is a need for methods to model and extract richer transformations that capture much higher-level invariances. To that end, we introduce a tool allowing to parametrize the set of filters of a trained convolutional neural network with the latent space of a generative adversarial network. 
We then show that the method can capture highly non-linear invariances of the data by visualizing their effect in the data space.", "keywords": "invariance;cnn;gan;infogan;transformation", "primary_area": "", "supplementary_material": "", "author": "Yannic Kilcher;Gary Becigneul;Thomas Hofmann", "authorids": "yannic.kilcher@inf.ethz.ch;gary.becigneul@inf.ethz.ch;thomas.hofmann@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkilcher2018parametrizing,\ntitle={Parametrizing filters of a {CNN} with a {GAN}},\nauthor={Yannic Kilcher and Gary Becigneul and Thomas Hofmann},\nyear={2018},\nurl={https://openreview.net/forum?id=H1-oTz-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1-oTz-Cb", "pdf_size": 0, "rating": "2;4;4", "confidence": "4;4;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1403365979136695361&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H113pWZRb", "title": "Topology Adaptive Graph Convolutional Networks", "track": "main", "status": "Reject", "tldr": "Low computational complexity graph CNN (without approximation) with better classification accuracy", "abstract": "Convolution acts as a local feature extractor in convolutional neural networks (CNNs). However, the convolution operation is not applicable when the input data is supported on an irregular graph such as with social networks, citation networks, or knowledge graphs. This paper proposes the topology adaptive graph convolutional network (TAGCN), a novel graph convolutional network that generalizes CNN architectures to graph-structured data and provides a systematic way to design a set of fixed-size learnable filters to perform convolutions on graphs. The topologies of these filters are adaptive to the topology of the graph when they scan the graph to perform convolution, replacing the square filter for the grid-structured data in traditional CNNs. The outputs are the weighted sum of these filters\u2019 outputs, extraction of both vertex features and strength of correlation between vertices. It\ncan be used with both directed and undirected graphs. The proposed TAGCN not only inherits the properties of convolutions in CNN for grid-structured data, but it is also consistent with convolution as defined in graph signal processing. Further, as no approximation to the convolution is needed, TAGCN exhibits better performance than existing graph-convolution-approximation methods on a number\nof data sets. As only the polynomials of degree two of the adjacency matrix are used, TAGCN is also computationally simpler than other recent methods.", "keywords": "graph convolutional neural networks;graph-structured data;semi-classification", "primary_area": "", "supplementary_material": "", "author": "Jian Du;Shanghang Zhang;Guanhang Wu;Jos\u00e9 M. F. 
Moura;Soummya Kar", "authorids": "jiand@andrew.cmu.edu;shanghaz@andrew.cmu.edu;guanhanw@andrew.cmu.edu;moura@andrew.cmu.edu;soummyak@andrew.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndu2018topology,\ntitle={Topology Adaptive Graph Convolutional Networks},\nauthor={Jian Du and Shanghang Zhang and Guanhang Wu and Jos\u00e9 M. F. Moura and Soummya Kar},\nyear={2018},\nurl={https://openreview.net/forum?id=H113pWZRb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=H113pWZRb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H113pWZRb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 423, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3144784708041904220&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H11lAfbCW", "title": "On Characterizing the Capacity of Neural Networks Using Algebraic Topology", "track": "main", "status": "Reject", "tldr": "We show that the learnability of different neural architectures can be characterized directly by computable measures of data complexity.", "abstract": "The learnability of different neural architectures can be characterized directly by computable measures of data complexity. In this paper, we reframe the problem of architecture selection as understanding how data determines the most expressive and generalizable architectures suited to that data, beyond inductive bias. After suggesting algebraic topology as a measure for data complexity, we show that the power of a network to express the topological complexity of a dataset in its decision boundary is a strictly limiting factor in its ability to generalize. We then provide the first empirical characterization of the topological capacity of neural networks. Our empirical analysis shows that at every level of dataset complexity, neural networks exhibit topological phase transitions and stratification. This observation allowed us to connect existing theory to empirically driven conjectures on the choice of architectures for a single hidden layer neural networks. ", "keywords": "deep learning theory;architecture selection;algebraic topology", "primary_area": "", "supplementary_material": "", "author": "William H. Guss;Ruslan Salakhutdinov", "authorids": "wguss@cs.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018on,\ntitle={On Characterizing the Capacity of Neural Networks Using Algebraic Topology},\nauthor={William H. 
Guss and Ruslan Salakhutdinov},\nyear={2018},\nurl={https://openreview.net/forum?id=H11lAfbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H11lAfbCW", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 5.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14805366423738847081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Mixed Precision Training of Convolutional Neural Networks using Integer Operations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/52", "id": "H135uzZ0-", "author_site": "Dipankar Das, Naveen Mellempudi, Dheevatsa Mudigere, Dhiraj Kalamkar, Sasikanth Avancha, Kunal Banerjee, Srinivas Sridharan, Karthik Vaidyanathan, Bharat Kaul, Evangelos Georganas, Alexander Heinecke, Pradeep K Dubey, Jesus Corbal, Nikita Shustrov, Roma Dubtsov, Evarist Fomenko, Vadim Pirogov", "tldr": "Mixed precision training pipeline using 16-bit integers on general purpose HW; SOTA accuracy for ImageNet-class CNNs; Best reported accuracy for ImageNet-1K classification task with any reduced precision training;", "abstract": "The state-of-the-art (SOTA) for mixed precision training is dominated by variants of low precision floating point operations, and in particular, FP16 accumulating into FP32 Micikevicius et al. (2017). On the other hand, while a lot of research has also happened in the domain of low and mixed-precision Integer training, these works either present results for non-SOTA networks (for instance only AlexNet for ImageNet-1K), or relatively small datasets (like CIFAR-10). In this work, we train state-of-the-art visual understanding neural networks on the ImageNet-1K dataset, with Integer operations on General Purpose (GP) hardware. In particular, we focus on Integer Fused-Multiply-and-Accumulate (FMA) operations which take two pairs of INT16 operands and accumulate results into an INT32 output.We propose a shared exponent representation of tensors and develop a Dynamic Fixed Point (DFP) scheme suitable for common neural network operations. The nuances of developing an efficient integer convolution kernel is examined, including methods to handle overflow of the INT32 accumulator. We implement CNN training for ResNet-50, GoogLeNet-v1, VGG-16 and AlexNet; and these networks achieve or exceed SOTA accuracy within the same number of iterations as their FP32 counterparts without any change in hyper-parameters and with a 1.8X improvement in end-to-end training throughput. 
To the best of our knowledge these results represent the first INT16 training results on GP hardware for ImageNet-1K dataset using SOTA CNNs and achieve highest reported accuracy using half precision ", "keywords": "deep learning training;reduced precision;imagenet;dynamic fixed point", "primary_area": "", "supplementary_material": "", "author": "Dipankar Das;Naveen Mellempudi;Dheevatsa Mudigere;Dhiraj Kalamkar;Sasikanth Avancha;Kunal Banerjee;Srinivas Sridharan;Karthik Vaidyanathan;Bharat Kaul;Evangelos Georganas;Alexander Heinecke;Pradeep Dubey;Jesus Corbal;Nikita Shustrov;Roma Dubtsov;Evarist Fomenko;Vadim Pirogov", "authorids": "dipankar.das@intel.com;naveen.k.mellempudi@intel.com;dheevatsa.mudigere@intel.com;dhiraj.d.kalamkar@intel.com;sasikanth.avancha@intel.com;kunal.banerjee@intel.com;srinivas.sridharan@intel.com;karthikeyan.vaidyanathan@intel.com;bharat.kaul@intel.com;evangelos.georganas@intel.com;alexander.heinecke@intel.com;pradeep.dubey@intel.com;jesus.corbal@intel.com;nikita.a.shustrov@intel.com;roman.s.dubtsov@intel.com;evarist.m.fomenko@intel.com;vadim.o.pirogov@intel.com", "gender": ";;;;;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;;;;", "bibtex": "@inproceedings{\ndas2018mixed,\ntitle={Mixed Precision Training of Convolutional Neural Networks using Integer Operations},\nauthor={Dipankar Das and Naveen Mellempudi and Dheevatsa Mudigere and Dhiraj Kalamkar and Sasikanth Avancha and Kunal Banerjee and Srinivas Sridharan and Karthik Vaidyanathan and Bharat Kaul and Evangelos Georganas and Alexander Heinecke and Pradeep Dubey and Jesus Corbal and Nikita Shustrov and Roma Dubtsov and Evarist Fomenko and Vadim Pirogov},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H135uzZ0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 17, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 226, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2511515363011215237&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H135uzZ0-", "pdf": "https://openreview.net/pdf?id=H135uzZ0-", "email": ";;;;;;;;;;;;;;;;", "author_num": 17 }, { "id": "H139Q_gAW", "title": "Learning Graph Convolution Filters from Data Manifold", "track": "main", "status": "Reject", "tldr": "We devise a novel Depthwise Separable Graph Convolution (DSGC) for the generic spatial domain data, which is highly compatible with depthwise separable convolution.", "abstract": "Convolution Neural Network (CNN) has gained tremendous success in computer vision tasks with its outstanding ability to capture the local latent features. Recently, there has been an increasing interest in extending CNNs to the general spatial domain. Although various types of graph convolution and geometric convolution methods have been proposed, their connections to traditional 2D-convolution are not well-understood. 
In this paper, we show that depthwise separable convolution is a path to unify the two kinds of convolution methods in one mathematical view, based on which we derive a novel Depthwise Separable Graph Convolution that subsumes existing graph convolution methods as special cases of our formulation. Experiments show that the proposed approach consistently outperforms other graph convolution and geometric convolution baselines on benchmark datasets in multiple domains.", "keywords": "Label Propagation;Depthwise separable convolution;Graph and geometric convolution", "primary_area": "", "supplementary_material": "", "author": "Guokun Lai;Hanxiao Liu;Yiming Yang", "authorids": "guokun@cs.cmu.edu;hanxiaol@cs.cmu.edu;yiming@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlai2018learning,\ntitle={Learning Graph Convolution Filters from Data Manifold},\nauthor={Guokun Lai and Hanxiao Liu and Yiming Yang},\nyear={2018},\nurl={https://openreview.net/forum?id=H139Q_gAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H139Q_gAW", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1670113646808321018&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H13WofbAb", "title": "Faster Distributed Synchronous SGD with Weak Synchronization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributed training of deep learning is widely conducted with large neural networks and large datasets. Besides asynchronous stochastic gradient descent~(SGD), synchronous SGD is a reasonable alternative with better convergence guarantees. However, synchronous SGD suffers from stragglers. To make things worse, although there are some strategies dealing with slow workers, the issue of slow servers is commonly ignored. In this paper, we propose a new parameter server~(PS) framework dealing with not only slow workers, but also slow servers by weakening the synchronization criterion. The empirical results show good performance when there are stragglers.", "keywords": "distributed;deep learning;straggler", "primary_area": "", "supplementary_material": "", "author": "Cong Xie;Oluwasanmi O. Koyejo;Indranil Gupta", "authorids": "cx2@illinois.edu;sanmi@illinois.edu;indy@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nxie2018faster,\ntitle={Faster Distributed Synchronous {SGD} with Weak Synchronization},\nauthor={Cong Xie and Oluwasanmi O. 
Koyejo and Indranil Gupta},\nyear={2018},\nurl={https://openreview.net/forum?id=H13WofbAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H13WofbAb", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13880553374870191998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H15RufWAW", "title": "GraphGAN: Generating Graphs via Random Walks", "track": "main", "status": "Reject", "tldr": "Using GANs to generate graphs via random walks.", "abstract": "We propose GraphGAN - the first implicit generative model for graphs that enables to mimic real-world networks.\nWe pose the problem of graph generation as learning the distribution of biased random walks over a single input graph.\nOur model is based on a stochastic neural network that generates discrete output samples, and is trained using the Wasserstein GAN objective. GraphGAN enables us to generate sibling graphs, which have similar properties yet are not exact replicas of the original graph. Moreover, GraphGAN learns a semantic mapping from the latent input space to the generated graph's properties. We discover that sampling from certain regions of the latent space leads to varying properties of the output graphs, with smooth transitions between them. Strong generalization properties of GraphGAN are highlighted by its competitive performance in link prediction as well as promising results on node classification, even though not specifically trained for these tasks.", "keywords": "GAN;graphs;random walks;implicit generative models", "primary_area": "", "supplementary_material": "", "author": "Aleksandar Bojchevski;Oleksandr Shchur;Daniel Z\u00fcgner;Stephan G\u00fcnnemann", "authorids": "a.bojchevski@in.tum.de;shchur@in.tum.de;daniel.zuegner@gmail.com;guennemann@in.tum.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbojchevski2018graphgan,\ntitle={Graph{GAN}: Generating Graphs via Random Walks},\nauthor={Aleksandar Bojchevski and Oleksandr Shchur and Daniel Z\u00fcgner and Stephan G\u00fcnnemann},\nyear={2018},\nurl={https://openreview.net/forum?id=H15RufWAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H15RufWAW", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": -0.944911182523068, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8717092627985280861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Semantic Interpolation in Implicit Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/117", "id": "H15odZ-C-", "author_site": "Yannic Kilcher, Aurelien Lucchi, Thomas Hofmann", "tldr": "", "abstract": "In implicit models, one often interpolates between sampled points in latent space. As we show in this paper, care needs to be taken to match-up the distributional assumptions on code vectors with the geometry of the interpolating paths. 
Otherwise, typical assumptions about the quality and semantics of in-between points may not be justified. Based on our analysis we propose to modify the prior code distribution to put significantly more probability mass closer to the origin. As a result, linear interpolation paths are not only shortest paths, but they are also guaranteed to pass through high-density regions, irrespective of the dimensionality of the latent space. Experiments on standard benchmark image datasets demonstrate clear visual improvements in the quality of the generated samples and exhibit more meaningful interpolation paths.", "keywords": "Deep Generative Models;GANs", "primary_area": "", "supplementary_material": "", "author": "Yannic Kilcher;Aurelien Lucchi;Thomas Hofmann", "authorids": "yannic.kilcher@inf.ethz.ch;aurelien.lucchi@inf.ethz.ch;thomas.hofmann@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkilcher2018semantic,\ntitle={Semantic Interpolation in Implicit Models},\nauthor={Yannic Kilcher and Aurelien Lucchi and Thomas Hofmann},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H15odZ-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11943550239831757729&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H15odZ-C-", "pdf": "https://openreview.net/pdf?id=H15odZ-C-", "email": ";;", "author_num": 3 }, { "id": "H18WqugAb", "title": "Still not systematic after all these years: On the compositional skills of sequence-to-sequence recurrent networks", "track": "main", "status": "Workshop", "tldr": "Using a simple language-driven navigation task, we study the compositional capabilities of modern seq2seq recurrent networks.", "abstract": "Humans can understand and produce new utterances effortlessly, thanks to their systematic compositional skills. Once a person learns the meaning of a new verb \"dax,\" he or she can immediately understand the meaning of \"dax twice\" or \"sing and dax.\" In this paper, we introduce the SCAN domain, consisting of a set of simple compositional navigation commands paired with the corresponding action sequences. We then test the zero-shot generalization capabilities of a variety of recurrent neural networks (RNNs) trained on SCAN with sequence-to-sequence methods. We find that RNNs can generalize well when the differences between training and test commands are small, so that they can apply \"mix-and-match\" strategies to solve the task. However, when generalization requires systematic compositional skills (as in the \"dax\" example above), RNNs fail spectacularly. 
We conclude with a proof-of-concept experiment in neural machine translation, supporting the conjecture that lack of systematicity is an important factor explaining why neural networks need very large training sets.", "keywords": "sequence-to-sequence recurrent networks;compositionality;systematicity;generalization;language-driven navigation", "primary_area": "", "supplementary_material": "", "author": "Brenden Lake;Marco Baroni", "authorids": "brenden@nyu.edu;marco.baroni@unitn.it", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlake2018still,\ntitle={Still not systematic after all these years: On the compositional skills of sequence-to-sequence recurrent networks},\nauthor={Brenden Lake and Marco Baroni},\nyear={2018},\nurl={https://openreview.net/forum?id=H18WqugAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H18WqugAb", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2251211046909792457&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H18uzzWAZ", "title": "Correcting Nuisance Variation using Wasserstein Distance", "track": "main", "status": "Reject", "tldr": "We correct nuisance variation for image embeddings across different domains, preserving only relevant information.", "abstract": "Profiling cellular phenotypes from microscopic imaging can provide meaningful biological information resulting from various factors affecting the cells. One motivating application is drug development: morphological cell features can be captured from images, from which similarities between different drugs applied at different dosages can be quantified. The general approach is to find a function mapping the images to an embedding space of manageable dimensionality whose geometry captures relevant features of the input images. An important known issue for such methods is separating relevant biological signal from nuisance variation. For example, the embedding vectors tend to be more correlated for cells that were cultured and imaged during the same week than for cells from a different week, despite having identical drug compounds applied in both cases. In this case, the particular batch a set of experiments were conducted in constitutes the domain of the data; an ideal set of image embeddings should contain only the relevant biological information (e.g. drug effects). We develop a general framework for adjusting the image embeddings in order to `forget' domain-specific information while preserving relevant biological information. To do this, we minimize a loss function based on distances between marginal distributions (such as the Wasserstein distance) of embeddings across domains for each replicated treatment. For the dataset presented, the replicated treatment is the negative control. We find that for our transformed embeddings (1) the underlying geometric structure is not only preserved but the embeddings also carry improved biological signal (2) less domain-specific information is present.", "keywords": "Nuisance variation;transform learning;image embeddings", "primary_area": "", "supplementary_material": "", "author": "Gil Tabak;Minjie Fan;Samuel J. 
Yang;Stephan Hoyer;Geoff Davis", "authorids": "tabak.gil@gmail.com;mjfan@google.com;samuely@google.com;shoyer@google.com;geoffd@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntabak2018correcting,\ntitle={Correcting Nuisance Variation using Wasserstein Distance},\nauthor={Gil Tabak and Minjie Fan and Samuel J. Yang and Stephan Hoyer and Geoff Davis},\nyear={2018},\nurl={https://openreview.net/forum?id=H18uzzWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H18uzzWAZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": -0.7559289460184546, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6718551769057528039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Word translation without parallel data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/336", "id": "H196sainb", "author_site": "Guillaume Lample, , Marc'Aurelio Ranzato, , Herv\u00e9 J\u00e9gou", "tldr": "Aligning languages without the Rosetta Stone: with no parallel data, we construct bilingual dictionaries using adversarial training, cross-domain local scaling, and an accurate proxy criterion for cross-validation.", "abstract": "State-of-the-art methods for learning cross-lingual word embeddings have relied on bilingual dictionaries or parallel corpora. Recent studies showed that the need for parallel data supervision can be alleviated with character-level information. While these methods showed encouraging results, they are not on par with their supervised counterparts and are limited to pairs of languages sharing a common alphabet. In this work, we show that we can build a bilingual dictionary between two languages without using any parallel corpora, by aligning monolingual word embedding spaces in an unsupervised way. Without using any character information, our model even outperforms existing supervised methods on cross-lingual tasks for some language pairs. Our experiments demonstrate that our method works very well also for distant language pairs, like English-Russian or English-Chinese. We finally describe experiments on the English-Esperanto low-resource language pair, on which there only exists a limited amount of parallel data, to show the potential impact of our method in fully unsupervised machine translation. 
Our code, embeddings and dictionaries are publicly available.", "keywords": "unsupervised learning;machine translation;multilingual embeddings;parallel dictionary induction;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Guillaume Lample;Alexis Conneau;Marc'Aurelio Ranzato;Ludovic Denoyer;Herv\u00e9 J\u00e9gou", "authorids": "glample@fb.com;aconneau@fb.com;ranzato@fb.com;ludovic.denoyer@upmc.fr;rvj@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlample2018word,\ntitle={Word translation without parallel data},\nauthor={Guillaume Lample and Alexis Conneau and Marc'Aurelio Ranzato and Ludovic Denoyer and Herv\u00e9 J\u00e9gou},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H196sainb},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/MUSE](https://github.com/facebookresearch/MUSE) + [![Papers with Code](/images/pwc_icon.svg) 18 community implementations](https://paperswithcode.com/paper/?openreview=H196sainb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "3;8;9", "confidence": "5;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": -0.777713771047819, "gs_citation": 1382, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10646845124593498896&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H196sainb", "pdf": "https://openreview.net/pdf?id=H196sainb", "email": ";;;;", "author_num": 5 }, { "id": "H1A5ztj3b", "title": "Super-Convergence: Very Fast Training of Residual Networks Using Large Learning Rates", "track": "main", "status": "Reject", "tldr": "Empirical proof of a new phenomenon requires new theoretical insights and is relevant to the active discussions in the literature on SGD and understanding generalization.", "abstract": "In this paper, we show a phenomenon, which we named ``super-convergence'', where residual networks can be trained using an order of magnitude fewer iterations than is used with standard training methods. The existence of super-convergence is relevant to understanding why deep networks generalize well. One of the key elements of super-convergence is training with cyclical learning rates and a large maximum learning rate. Furthermore, we present evidence that training with large learning rates improves performance by regularizing the network. In addition, we show that super-convergence provides a greater boost in performance relative to standard training when the amount of labeled training data is limited. We also derive a simplification of the Hessian Free optimization method to compute an estimate of the optimal learning rate. The architectures to replicate this work will be made available upon publication.\n", "keywords": "Deep Learning;machine learning", "primary_area": "", "supplementary_material": "", "author": "Leslie N. 
Smith;Nicholay Topin", "authorids": "leslie.smith@nrl.navy.mil;ntopin1@umbc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nn.2018superconvergence,\ntitle={Super-Convergence: Very Fast Training of Residual Networks Using Large Learning Rates},\nauthor={Leslie N. Smith and Nicholay Topin},\nyear={2018},\nurl={https://openreview.net/forum?id=H1A5ztj3b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1A5ztj3b", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;3;4", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1908, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2219536765070582068&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "H1BHbmWCZ", "title": "TOWARDS ROBOT VISION MODULE DEVELOPMENT WITH EXPERIENTIAL ROBOT LEARNING", "track": "main", "status": "Reject", "tldr": "3 thrusts serving as stepping stones for robot experiential learning of vision module", "abstract": "In this paper we present a thrust in three directions of visual development using supervised and semi-supervised techniques. The first is an implementation of semi-supervised object detection and recognition using the principles of Soft Attention and Generative Adversarial Networks (GANs). The second and the third are supervised networks that learn basic concepts of spatial locality and quantity respectively using Convolutional Neural Networks (CNNs). The three thrusts together are based on the approach of Experiential Robot Learning, introduced in previous publication. 
While the results are unripe for implementation, we believe they constitute a stepping stone towards autonomous development of robotic visual modules.", "keywords": "Deep Learning;Robotics;Artificial Intelligence;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Ahmed A Aly;Joanne Bechta Dugan", "authorids": "aaa2cn@virginia.edu;jbd@virginia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\na2018towards,\ntitle={{TOWARDS} {ROBOT} {VISION} {MODULE} {DEVELOPMENT} {WITH} {EXPERIENTIAL} {ROBOT} {LEARNING}},\nauthor={Ahmed A Aly and Joanne Bechta Dugan},\nyear={2018},\nurl={https://openreview.net/forum?id=H1BHbmWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1BHbmWCZ", "pdf_size": 0, "rating": "2;2;3", "confidence": "3;4;4", "rating_avg": 2.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-I7ZfaIIIlcJ:scholar.google.com/&scioq=TOWARDS+ROBOT+VISION+MODULE+DEVELOPMENT+WITH+EXPERIENTIAL+ROBOT+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Generating Natural Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/142", "id": "H1BLjgZCb", "author_site": "Zhengli Zhao, Dheeru Dua, Sameer Singh", "tldr": "We propose a framework to generate \u201cnatural\u201d adversaries against black-box classifiers for both visual and textual domains, by doing the search for adversaries in the latent semantic space.", "abstract": "Due to their complex nature, it is hard to characterize the ways in which machine learning models can misbehave or be exploited when deployed. Recent work on adversarial examples, i.e. inputs with minor perturbations that result in substantially different model predictions, is helpful in evaluating the robustness of these models by exposing the adversarial scenarios where they fail. However, these malicious perturbations are often unnatural, not semantically meaningful, and not applicable to complicated domains such as language. In this paper, we propose a framework to generate natural and legible adversarial examples that lie on the data manifold, by searching in semantic space of dense and continuous data representation, utilizing the recent advances in generative adversarial networks. We present generated adversaries to demonstrate the potential of the proposed approach for black-box classifiers for a wide range of applications such as image classification, textual entailment, and machine translation. 
We include experiments to show that the generated adversaries are natural, legible to humans, and useful in evaluating and analyzing black-box classifiers.", "keywords": "adversarial examples;generative adversarial networks;interpretability;image classification;textual entailment;machine translation", "primary_area": "", "supplementary_material": "", "author": "Zhengli Zhao;Dheeru Dua;Sameer Singh", "authorids": "zhengliz@uci.edu;ddua@uci.edu;sameer@uci.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhao2018generating,\ntitle={Generating Natural Adversarial Examples},\nauthor={Zhengli Zhao and Dheeru Dua and Sameer Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1BLjgZCb},\n}", "github": "[![github](/images/github_icon.svg) zhengliz/natural-adversary](https://github.com/zhengliz/natural-adversary)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 747, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6487263081764376046&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1BLjgZCb", "pdf": "https://openreview.net/pdf?id=H1BLjgZCb", "email": ";;", "author_num": 3 }, { "id": "H1BO9M-0Z", "title": "Lifelong Word Embedding via Meta-Learning", "track": "main", "status": "Reject", "tldr": "learning better domain embeddings via lifelong learning and meta-learning", "abstract": "Learning high-quality word embeddings is of significant importance in achieving better performance in many down-stream learning tasks. On one hand, traditional word embeddings are trained on a large scale corpus for general-purpose tasks, which are often sub-optimal for many domain-specific tasks. On the other hand, many domain-specific tasks do not have a large enough domain corpus to obtain high-quality embeddings. We observe that domains are not isolated and a small domain corpus can leverage the learned knowledge from many past domains to augment that corpus in order to generate high-quality embeddings. In this paper, we formulate the learning of word embeddings as a lifelong learning process. Given knowledge learned from many previous domains and a small new domain corpus, the proposed method can effectively generate new domain embeddings by leveraging a simple but effective algorithm and a meta-learner, where the meta-learner is able to provide word context similarity information at the domain-level. Experimental results demonstrate that the proposed method can effectively learn new domain embeddings from a small corpus and past domain knowledges\\footnote{We will release the code after final revisions.}. We also demonstrate that general-purpose embeddings trained from a large scale corpus are sub-optimal in domain-specific tasks.", "keywords": "Lifelong learning;meta learning;word embedding", "primary_area": "", "supplementary_material": "", "author": "Hu Xu;Bing Liu;Lei Shu;Philip S. 
Yu", "authorids": "hxu48@uic.edu;liub@uic.edu;lshu3@uic.edu;psyu@uic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxu2018lifelong,\ntitle={Lifelong Word Embedding via Meta-Learning},\nauthor={Hu Xu and Bing Liu and Lei Shu and Philip S. Yu},\nyear={2018},\nurl={https://openreview.net/forum?id=H1BO9M-0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1BO9M-0Z", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11458291411395488455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H1DGha1CZ", "title": "Enhancing Batch Normalized Convolutional Networks using Displaced Rectifier Linear Units: A Systematic Comparative Study", "track": "main", "status": "Reject", "tldr": "A new activation function called Displaced Rectifier Linear Unit is proposed. It is showed to enhance the training and inference performance of batch normalized convolutional neural networks.", "abstract": "In this paper, we turn our attention to the interworking between the activation functions and the batch normalization, which is a virtually mandatory technique to train deep networks currently. We propose the activation function Displaced Rectifier Linear Unit (DReLU) by conjecturing that extending the identity function of ReLU to the third quadrant enhances compatibility with batch normalization. Moreover, we used statistical tests to compare the impact of using distinct activation functions (ReLU, LReLU, PReLU, ELU, and DReLU) on the learning speed and test accuracy performance of standardized VGG and Residual Networks state-of-the-art models. These convolutional neural networks were trained on CIFAR-100 and CIFAR-10, the most commonly used deep learning computer vision datasets. The results showed DReLU speeded up learning in all models and datasets. Besides, statistical significant performance assessments (p<0.05) showed DReLU enhanced the test accuracy presented by ReLU in all scenarios. Furthermore, DReLU showed better test accuracy than any other tested activation function in all experiments with one exception, in which case it presented the second best performance. Therefore, this work demonstrates that it is possible to increase performance replacing ReLU by an enhanced activation function.", "keywords": "Batch Normalized;Convolutional Neural Networks;Displaced Rectifier Linear Unit;Comparative Study", "primary_area": "", "supplementary_material": "", "author": "David Mac\u00eado;Cleber Zanchettin;Adriano L. I. Oliveira;Teresa Ludermir", "authorids": "dlm@cin.ufpe.br;cz@cin.ufpe.br;alio@cin.ufpe.br;tbl@cin.ufpe.br", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmac\u00eado2018enhancing,\ntitle={Enhancing Batch Normalized Convolutional Networks using Displaced Rectifier Linear Units: A Systematic Comparative Study},\nauthor={David Mac\u00eado and Cleber Zanchettin and Adriano L. I. 
Oliveira and Teresa Ludermir},\nyear={2018},\nurl={https://openreview.net/forum?id=H1DGha1CZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1DGha1CZ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6156772290933184769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1DJFybC-", "title": "Learning to Infer Graphics Programs from Hand-Drawn Images", "track": "main", "status": "Reject", "tldr": "Learn to convert a hand drawn sketch into a high-level program", "abstract": " We introduce a model that learns to convert simple hand drawings\n into graphics programs written in a subset of \\LaTeX.~The model\n combines techniques from deep learning and program synthesis. We\n learn a convolutional neural network that proposes plausible drawing\n primitives that explain an image. These drawing primitives are like\n a trace of the set of primitive commands issued by a graphics\n program. We learn a model that uses program synthesis techniques to\n recover a graphics program from that trace. These programs have\n constructs like variable bindings, iterative loops, or simple kinds\n of conditionals. With a graphics program in hand, we can correct\n errors made by the deep network and extrapolate drawings. Taken\n together these results are a step towards agents that induce useful,\n human-readable programs from perceptual input.", "keywords": "program induction;HCI;deep learning", "primary_area": "", "supplementary_material": "", "author": "Kevin Ellis;Daniel Ritchie;Armando Solar-Lezama;Joshua B. Tenenbaum", "authorids": "ellisk@mit.edu;daniel_richie@brown.edu;asolar@csail.mit.edu;jbt@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nellis2018learning,\ntitle={Learning to Infer Graphics Programs from Hand-Drawn Images},\nauthor={Kevin Ellis and Daniel Ritchie and Armando Solar-Lezama and Joshua B. Tenenbaum},\nyear={2018},\nurl={https://openreview.net/forum?id=H1DJFybC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1DJFybC-", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;2;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14065112485794121024&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16 }, { "id": "H1DkN7ZCZ", "title": "Deep learning mutation prediction enables early stage lung cancer detection in liquid biopsy", "track": "main", "status": "Workshop", "tldr": " Current somatic mutation methods do not work with liquid biopsies (ie low coverage sequencing), we apply a CNN architecture to a unique representation of a read and its ailgnment, we show significant improvement over previous methods in the low frequency setting.", "abstract": "Somatic cancer mutation detection at ultra-low variant allele frequencies (VAFs) is an unmet challenge that is intractable with current state-of-the-art mutation calling methods. 
Specifically, the limit of VAF detection is closely related to the depth of coverage, due to the requirement of multiple supporting reads in extant methods, precluding the detection of mutations at VAFs that are orders of magnitude lower than the depth of coverage. Nevertheless, the ability to detect cancer-associated mutations in ultra low VAFs is a fundamental requirement for low-tumor burden cancer diagnostics applications such as early detection, monitoring, and therapy nomination using liquid biopsy methods (cell-free DNA). Here we defined a spatial representation of sequencing information adapted for convolutional architecture that enables variant detection at VAFs, in a manner independent of the depth of sequencing. This method enables the detection of cancer mutations even in VAFs as low as 10^-4, >2 orders of magnitude below the current state-of-the-art. We validated our method on both simulated plasma and on clinical cfDNA plasma samples from cancer patients and non-cancer controls. This method introduces a new domain within bioinformatics and personalized medicine \u2013 somatic whole genome mutation calling for liquid biopsy.", "keywords": "somatic mutation;variant calling;cancer;liquid biopsy;early detection;convolution;deep learning;machine learning;lung cancer;error suppression;mutect", "primary_area": "", "supplementary_material": "", "author": "Steven T. Kothen-Hill;Asaf Zviran;Rafael C. Schulman;Sunil Deochand;Federico Gaiti;Dillon Maloney;Kevin Y. Huang;Will Liao;Nicolas Robine;Nathaniel D. Omans;Dan A. Landau", "authorids": "sth2022@med.cornell.edu;azviran@nygenome.org;rschulman@nygenome.org;sdd325@nyu.edu;fgaiti@nygenome.org;dmaloney@nygenome.org;khuang@nygenome.org;wliao@nygenome.org;nrobine@nygenome.org;nao2013@med.cornell.edu;dal3005@med.cornell.edu", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\nt.2018deep,\ntitle={Deep learning mutation prediction enables early stage lung cancer detection in liquid biopsy},\nauthor={Steven T. Kothen-Hill and Asaf Zviran and Rafael C. Schulman and Sunil Deochand and Federico Gaiti and Dillon Maloney and Kevin Y. Huang and Will Liao and Nicolas Robine and Nathaniel D. Omans and Dan A. 
Landau},\nyear={2018},\nurl={https://openreview.net/forum?id=H1DkN7ZCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1DkN7ZCZ", "pdf_size": 0, "rating": "4;5;8", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 11, "corr_rating_confidence": 0.6933752452815364, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3508223998822102158&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Distributed Prioritized Experience Replay", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/134", "id": "H1Dy---0Z", "author_site": "Daniel Horgan, John Quan, David Budden, Gabriel Barth-maron, Matteo Hessel, Hado van Hasselt, David Silver", "tldr": "A distributed architecture for deep reinforcement learning at scale, using parallel data-generation to improve the state of the art on the Arcade Learning Environment benchmark in a fraction of the wall-clock training time of previous approaches.", "abstract": "We propose a distributed architecture for deep reinforcement learning at scale, that enables agents to learn effectively from orders of magnitude more data than previously possible. The algorithm decouples acting from learning: the actors interact with their own instances of the environment by selecting actions according to a shared neural network, and accumulate the resulting experience in a shared experience replay memory; the learner replays samples of experience and updates the neural network. The architecture relies on prioritized experience replay to focus only on the most significant data generated by the actors. Our architecture substantially improves the state of the art on the Arcade Learning Environment, achieving better final performance in a fraction of the wall-clock training time.", "keywords": "deep learning;reinforcement learning;distributed systems", "primary_area": "", "supplementary_material": "", "author": "Dan Horgan;John Quan;David Budden;Gabriel Barth-Maron;Matteo Hessel;Hado van Hasselt;David Silver", "authorids": "horgan@google.com;johnquan@google.com;budden@google.com;gabrielbm@google.com;mtthss@google.com;hado@google.com;davidsilver@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nhorgan2018distributed,\ntitle={Distributed Prioritized Experience Replay},\nauthor={Dan Horgan and John Quan and David Budden and Gabriel Barth-Maron and Matteo Hessel and Hado van Hasselt and David Silver},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Dy---0Z},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 15 community implementations](https://paperswithcode.com/paper/?openreview=H1Dy---0Z)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 7, "corr_rating_confidence": 0.7559289460184545, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=H1Dy---0Z", "pdf": "https://openreview.net/pdf?id=H1Dy---0Z", "email": ";;;;;;", 
"author_num": 7 }, { "id": "H1I3M7Z0b", "title": "WSNet: Learning Compact and Efficient Networks with Weight Sampling", "track": "main", "status": "Workshop", "tldr": "We present a novel network architecture for learning compact and efficient deep neural networks", "abstract": "\tWe present a new approach and a novel architecture, termed WSNet, for learning compact and efficient deep neural networks. Existing approaches conventionally learn full model parameters independently and then compress them via \\emph{ad hoc} processing such as model pruning or filter factorization. Alternatively, WSNet proposes learning model parameters by sampling from a compact set of learnable parameters, which naturally enforces {parameter sharing} throughout the learning process. We demonstrate that such a novel weight sampling approach (and induced WSNet) promotes both weights and computation sharing favorably. By employing this method, we can more efficiently learn much smaller networks with competitive performance compared to baseline networks with equal numbers of convolution filters. Specifically, we consider learning compact and efficient 1D convolutional neural networks for audio classification. Extensive experiments on multiple audio classification datasets verify the effectiveness of WSNet. Combined with weight quantization, the resulted models are up to \\textbf{180$\\times$} smaller and theoretically up to \\textbf{16$\\times$} faster than the well-established baselines, without noticeable performance drop.", "keywords": "Deep learning;model compression", "primary_area": "", "supplementary_material": "", "author": "Xiaojie Jin;Yingzhen Yang;Ning Xu;Jianchao Yang;Jiashi Feng;Shuicheng Yan", "authorids": "xiaojie.jin@u.nus.edu;superyyzg@gmail.com;ning.xu@snap.com;jiachao.yang@snap.com;elefjia@nus.edu.sg;yanshuicheng@360.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\njin2018wsnet,\ntitle={{WSN}et: Learning Compact and Efficient Networks with Weight Sampling},\nauthor={Xiaojie Jin and Yingzhen Yang and Ning Xu and Jianchao Yang and Jiashi Feng and Shuicheng Yan},\nyear={2018},\nurl={https://openreview.net/forum?id=H1I3M7Z0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1I3M7Z0b", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z1he6JzoL0sJ:scholar.google.com/&scioq=WSNet:+Learning+Compact+and+Efficient+Networks+with+Weight+Sampling&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1K6Tb-AZ", "title": "TESLA: Task-wise Early Stopping and Loss Aggregation for Dynamic Neural Network Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "For inference operations in deep neural networks on end devices, it is desirable to deploy a single pre-trained neural network model, which can dynamically scale across a computation range without comprising accuracy. To achieve this goal, Incomplete Dot Product (IDP) has been proposed to use only a subset of terms in dot products during forward propagation. 
However, there are some limitations, including noticeable performance degradation in operating regions with low computational costs, and essential performance limitations since IDP uses hand-crafted profile coefficients. In this paper, we extend IDP by proposing new training algorithms involving a single profile, which may be trainable or pre-determined, to significantly improve the overall performance, especially in operating regions with low computational costs. Specifically, we propose the Task-wise Early Stopping and Loss Aggregation (TESLA) algorithm, which is showed in our 3-layer multilayer perceptron on MNIST that outperforms the original IDP by 32\\% when only 10\\% of dot products terms are used and achieves 94.7\\% accuracy on average. By introducing trainable profile coefficients, TESLA further improves the accuracy to 95.5\\% without specifying coefficients in advance. Besides, TESLA is applied to the VGG-16 model, which achieves 80\\% accuracy using only 20\\% of dot product terms on CIFAR-10 and also keeps 60\\% accuracy using only 30\\% of dot product terms on CIFAR-100, but the original IDP performs like a random guess in these two datasets at such low computation costs. Finally, we visualize the learned representations at different dot product percentages by class activation map and show that, by applying TESLA, the learned representations can adapt over a wide range of operation regions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chun-Min Chang;Chia-Ching Lin;Hung-Yi Ou Yang;Chin-Laung Lei;Kuan-Ta Chen", "authorids": "cmchang@iis.sinica.edu.tw;d05921018@ntu.edu.tw;frank840925@gmail.com;cllei@ntu.edu.tw;swc@iis.sinica.edu.tw", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchang2018tesla,\ntitle={{TESLA}: Task-wise Early Stopping and Loss Aggregation for Dynamic Neural Network Inference},\nauthor={Chun-Min Chang and Chia-Ching Lin and Hung-Yi Ou Yang and Chin-Laung Lei and Kuan-Ta Chen},\nyear={2018},\nurl={https://openreview.net/forum?id=H1K6Tb-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1K6Tb-AZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "2;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PB8KHJ4FQucJ:scholar.google.com/&scioq=TESLA:+Task-wise+Early+Stopping+and+Loss+Aggregation+for+Dynamic+Neural+Network+Inference&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1LAqMbRW", "title": "Latent forward model for Real-time Strategy game planning with incomplete information", "track": "main", "status": "Reject", "tldr": "The paper analyzes the latent space learned by model-free approaches in a miniature incomplete information game, trains a forward model in the latent space and apply it to Monte-Carlo Tree Search, yielding positive performance.", "abstract": "Model-free deep reinforcement learning approaches have shown superhuman performance in simulated environments (e.g., Atari games, Go, etc). During training, these approaches often implicitly construct a latent space that contains key information for decision making. 
In this paper, we learn a forward model on this latent space and apply it to model-based planning in miniature Real-time Strategy game with incomplete information (MiniRTS). We first show that the latent space constructed from existing actor-critic models contains relevant information of the game, and design training procedure to learn forward models. We also show that our learned forward model can predict meaningful future state and is usable for latent space Monte-Carlo Tree Search (MCTS), in terms of win rates against rule-based agents.", "keywords": "Real time strategy;latent space;forward model;monte carlo tree search;reinforcement learning;planning", "primary_area": "", "supplementary_material": "", "author": "Yuandong Tian;Qucheng Gong", "authorids": "yuandong@fb.com;qucheng@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntian2018latent,\ntitle={Latent forward model for Real-time Strategy game planning with incomplete information},\nauthor={Yuandong Tian and Qucheng Gong},\nyear={2018},\nurl={https://openreview.net/forum?id=H1LAqMbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1LAqMbRW", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6108105453138440196&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Understanding Short-Horizon Bias in Stochastic Meta-Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/240", "id": "H1MczcgR-", "author_site": "Yuhuai Wu, Mengye Ren, Renjie Liao, Roger Grosse", "tldr": "We investigate the bias in the short-horizon meta-optimization objective.", "abstract": "Careful tuning of the learning rate, or even schedules thereof, can be crucial to effective neural net training. There has been much recent interest in gradient-based meta-optimization, where one tunes hyperparameters, or even learns an optimizer, in order to minimize the expected loss when the training procedure is unrolled. But because the training procedure must be unrolled thousands of times, the meta-objective must be defined with an orders-of-magnitude shorter time horizon than is typical for neural net training. We show that such short-horizon meta-objectives cause a serious bias towards small step sizes, an effect we term short-horizon bias. We introduce a toy problem, a noisy quadratic cost function, on which we analyze short-horizon bias by deriving and comparing the optimal schedules for short and long time horizons. We then run meta-optimization experiments (both offline and online) on standard benchmark datasets, showing that meta-optimization chooses too small a learning rate by multiple orders of magnitude, even when run with a moderately long time horizon (100 steps) typical of work in the area. 
We believe short-horizon bias is a fundamental problem that needs to be addressed if meta-optimization is to scale to practical neural net training regimes.", "keywords": "meta-learning; optimization; short-horizon bias.", "primary_area": "", "supplementary_material": "", "author": "Yuhuai Wu;Mengye Ren;Renjie Liao;Roger Grosse.", "authorids": "ywu@cs.toronto.edu;mren@cs.toronto.edu;rjliao@cs.toronto.edu;rgrosse@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwu2018understanding,\ntitle={Understanding Short-Horizon Bias in Stochastic Meta-Optimization},\nauthor={Yuhuai Wu and Mengye Ren and Renjie Liao and Roger Grosse.},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1MczcgR-},\n}", "github": "[![github](/images/github_icon.svg) renmengye/meta-optim-public](https://github.com/renmengye/meta-optim-public)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;3", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10519066902248713180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=H1MczcgR-", "pdf": "https://openreview.net/pdf?id=H1MczcgR-", "email": ";;;", "author_num": 4 }, { "id": "H1NV4agCb", "title": "Tracking Loss: Converting Object Detector to Robust Visual Tracker", "track": "main", "status": "Reject", "tldr": "We successfully convert a popular detector RPN to a well-performed tracker from the viewpoint of loss function.", "abstract": "In this paper, we find that by designing a novel loss function entitled, ''tracking loss'', Convolutional Neural Network (CNN) based object detectors can be successfully converted to well-performed visual trackers without any extra computational cost. This property is preferable to visual tracking where annotated video sequences for training are always absent, because rich features learned by detectors from still images could be utilized by dynamic trackers. It also avoids extra machinery such as feature engineering and feature aggregation proposed in previous studies. Tracking loss achieves this property by exploiting the internal structure of feature maps within the detection network and treating different feature points discriminatively. Such structure allows us to simultaneously consider discrimination quality and bounding box accuracy which is found to be crucial to the success. We also propose a network compression method to accelerate tracking speed without performance reduction. That also verifies tracking loss will remain highly effective even if the network is drastically compressed. Furthermore, if we employ a carefully designed tracking loss ensemble, the tracker would be much more robust and accurate. Evaluation results show that our trackers (including the ensemble tracker and two baseline trackers), outperform all state-of-the-art methods on VOT 2016 Challenge in terms of Expected Average Overlap (EAO) and robustness. 
We will make the code publicly available.", "keywords": "Object detection;Visual Tracking;Loss function;Region Proposal Network;Network compression", "primary_area": "", "supplementary_material": "", "author": "Zhenbin Yan;Jimmy Ren;Stephen Shaoyi Liao;Kai Yang", "authorids": "zhenb.yan@gmail.com;jimmy.sj.ren@gmail.com;issliao@cityu.edu.hk;kayang6-c@my.cityu.edu.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyan2018tracking,\ntitle={Tracking Loss: Converting Object Detector to Robust Visual Tracker},\nauthor={Zhenbin Yan and Jimmy Ren and Stephen Shaoyi Liao and Kai Yang},\nyear={2018},\nurl={https://openreview.net/forum?id=H1NV4agCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1NV4agCb", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:v0Ieu8vT-C8J:scholar.google.com/&scioq=Tracking+Loss:+Converting+Object+Detector+to+Robust+Visual+Tracker&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "H1Nyf7W0Z", "title": "Alpha-divergence bridges maximum likelihood and reinforcement learning in neural sequence generation", "track": "main", "status": "Reject", "tldr": "Propose new objective function for neural sequence generation which integrates ML-based and RL-based objective functions.", "abstract": "Neural sequence generation is commonly approached by using maximum- likelihood (ML) estimation or reinforcement learning (RL). However, it is known that they have their own shortcomings; ML presents training/testing discrepancy, whereas RL suffers from sample inefficiency. We point out that it is difficult to resolve all of the shortcomings simultaneously because of a tradeoff between ML and RL. In order to counteract these problems, we propose an objective function for sequence generation using \u03b1-divergence, which leads to an ML-RL integrated method that exploits better parts of ML and RL. We demonstrate that the proposed objective function generalizes ML and RL objective functions because it includes both as its special cases (ML corresponds to \u03b1 \u2192 0 and RL to \u03b1 \u2192 1). We provide a proposition stating that the difference between the RL objective function and the proposed one monotonically decreases with increasing \u03b1. 
Experimental results on machine translation tasks show that minimizing the proposed objective function achieves better sequence generation performance than ML-based methods.", "keywords": "neural network;reinforcement learning;natural language processing;machine translation;alpha-divergence", "primary_area": "", "supplementary_material": "", "author": "Sotetsu Koyamada;Yuta Kikuchi;Atsunori Kanemura;Shin-ichi Maeda;Shin Ishii", "authorids": "sotetsu.koyamada@gmail.com;sotetsu.koyamada@gmail.com;sotetsu.koyamada@gmail.com;sotetsu.koyamada@gmail.com;sotetsu.koyamada@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkoyamada2018alphadivergence,\ntitle={Alpha-divergence bridges maximum likelihood and reinforcement learning in neural sequence generation},\nauthor={Sotetsu Koyamada and Yuta Kikuchi and Atsunori Kanemura and Shin-ichi Maeda and Shin Ishii},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Nyf7W0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1Nyf7W0Z", "pdf_size": 0, "rating": "4;4;4", "confidence": "1;5;3", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GTdDOayf0gcJ:scholar.google.com/&scioq=Alpha-divergence+bridges+maximum+likelihood+and+reinforcement+learning+in+neural+sequence+generation&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1O0KGC6b", "title": "Post-training for Deep Learning", "track": "main", "status": "Reject", "tldr": "We propose an additional training step, called post-training, which computes optimal weights for the last layer of the network.", "abstract": "One of the main challenges of deep learning methods is the choice of an appropriate training strategy. In particular, additional steps, such as unsupervised pre-training, have been shown to greatly improve the performances of deep structures. In this article, we propose an extra training step, called post-training, which only optimizes the last layer of the network. We show that this procedure can be analyzed in the context of kernel theory, with the first layers computing an embedding of the data and the last layer a statistical model to solve the task based on this embedding. This step makes sure that the embedding, or representation, of the data is used in the best possible way for the considered task. 
This idea is then tested on multiple architectures with various data sets, showing that it consistently provides a boost in performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Moreau;Julien Audiffren", "authorids": "thomas.moreau@cmla.ens-cachan.fr;julien.audiffren@cmla.ens-cachan.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmoreau2018posttraining,\ntitle={Post-training for Deep Learning},\nauthor={Thomas Moreau and Julien Audiffren},\nyear={2018},\nurl={https://openreview.net/forum?id=H1O0KGC6b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1O0KGC6b", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "H1OQukZ0-", "title": "Online Hyper-Parameter Optimization", "track": "main", "status": "Reject", "tldr": "An algorithm for optimizing regularization hyper-parameters during training", "abstract": "We propose an efficient online hyperparameter optimization method which uses a joint dynamical system to evaluate the gradient with respect to the hyperparameters. While similar methods are usually limited to hyperparameters with a smooth impact on the model, we show how to apply it to the probability of dropout in neural networks. Finally, we show its effectiveness on two distinct tasks.", "keywords": "hyper-parameters;optimization", "primary_area": "", "supplementary_material": "", "author": "Damien Vincent;Sylvain Gelly;Nicolas Le Roux;Olivier Bousquet", "authorids": "damienv@google.com;sylvain.gelly@gmail.com;nicolas@le-roux.name;obousquet@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nvincent2018online,\ntitle={Online Hyper-Parameter Optimization},\nauthor={Damien Vincent and Sylvain Gelly and Nicolas Le Roux and Olivier Bousquet},\nyear={2018},\nurl={https://openreview.net/forum?id=H1OQukZ0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1OQukZ0-", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "H1RPJf5Tz", "title": "Curiosity-driven Exploration by Bootstrapping Features", "track": "main", "status": "Withdraw", "tldr": "A simple intrinsic motivation method using forward dynamics model error in feature space of the policy.", "abstract": "We introduce CBF, an exploration method that works in the absence of rewards or end of episode signal. CBF is based on intrinsic reward derived from the error of a dynamics model operating in feature space. It was inspired by (Pathak et al., 2017), is easy to implement, and can achieve results such as passing four levels of Super Mario Bros, navigating VizDoom mazes and passing two levels of SpaceInvaders. 
We investigated the effect of combining the method with several auxiliary tasks, but find inconsistent improvements over the CBF baseline.\n", "keywords": "exploration;intrinsic motivation;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Harri Edwards;Yuri Burda;Amos Storkey", "authorids": "h.l.edwards@sms.ed.ac.uk;yburda@gmail.com;a.storkey@ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=H1RPJf5Tz", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 3, "corr_rating_confidence": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8811494864385143681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Deep Complex Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/2", "id": "H1T2hmZAb", "author_site": "Chiheb Trabelsi, Olexa Bilaniuk, Ying Zhang, Dmitriy Serdyuk, Sandeep Subramanian, Joao Felipe Santos, Soroush Mehri, Negar Rostamzadeh, Yoshua Bengio, Christopher Pal", "tldr": "", "abstract": "At present, the vast majority of building blocks, techniques, and architectures for deep learning are based on real-valued operations and representations. However, recent work on recurrent neural networks and older fundamental theoretical analysis suggests that complex numbers could have a richer representational capacity and could also facilitate noise-robust memory retrieval mechanisms. Despite their attractive properties and potential for opening up entirely new neural architectures, complex-valued deep neural networks have been marginalized due to the absence of the building blocks required to design such models. In this work, we provide the key atomic components for complex-valued deep neural networks and apply them to convolutional feed-forward networks. More precisely, we rely on complex convolutions and present algorithms for complex batch-normalization, complex weight initialization strategies for complex-valued neural nets and we use them in experiments with end-to-end training schemes. We demonstrate that such complex-valued models are competitive with their real-valued counterparts. We test deep complex models on several computer vision tasks, on music transcription using the MusicNet dataset and on Speech spectrum prediction using TIMIT. 
We achieve state-of-the-art performance on these audio-related tasks.", "keywords": "deep learning;complex-valued neural networks", "primary_area": "", "supplementary_material": "", "author": "Chiheb Trabelsi;Olexa Bilaniuk;Ying Zhang;Dmitriy Serdyuk;Sandeep Subramanian;Joao Felipe Santos;Soroush Mehri;Negar Rostamzadeh;Yoshua Bengio;Christopher J Pal", "authorids": "chiheb.trabelsi@polymtl.ca;olexa.bilaniuk@umontreal.ca;ying.zhang@umontreal.ca;serdyuk@iro.umontreal.ca;sandeep.subramanian.1@umontreal.ca;jfsantos@emt.inrs.ca;soroush.mehri@microsoft.com;negar@elementai.com;yoshua.bengio@umontreal.ca;christopher.pal@polymtl.ca", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\ntrabelsi2018deep,\ntitle={Deep Complex Networks},\nauthor={Chiheb Trabelsi and Olexa Bilaniuk and Ying Zhang and Dmitriy Serdyuk and Sandeep Subramanian and Joao Felipe Santos and Soroush Mehri and Negar Rostamzadeh and Yoshua Bengio and Christopher J Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1T2hmZAb},\n}", "github": "[![github](/images/github_icon.svg) ChihebTrabelsi/deep_complex_networks](https://github.com/ChihebTrabelsi/deep_complex_networks) + [![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=H1T2hmZAb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;8", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 1142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18218729763326747000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1T2hmZAb", "pdf": "https://openreview.net/pdf?id=H1T2hmZAb", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "H1TWfmnNf", "title": "Do Convolutional Neural Networks act as Compositional Nearest Neighbors?", "track": "main", "status": "Withdraw", "tldr": "Convolutional Neural Networks behave as Compositional Nearest Neighbors!", "abstract": "We present a simple approach based on pixel-wise nearest neighbors to understand and interpret the functioning of state-of-the-art neural networks for pixel-level tasks. We aim to understand and uncover the synthesis/prediction mechanisms of state-of-the-art convolutional neural networks. To this end, we primarily analyze the synthesis process of generative models and the prediction mechanism of discriminative models. The main hypothesis of this work is that convolutional neural networks for pixel-level tasks learn a fast compositional nearest neighbor synthesis/prediction function. 
Our experiments on semantic segmentation and image-to-image translation show qualitative and quantitative evidence supporting this hypothesis.", "keywords": "interpreting convolutional neural networks;nearest neighbors;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper1109/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018do,\n title={Do Convolutional Neural Networks act as Compositional Nearest Neighbors?},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=By4Nxm-CW}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1TWfmnNf", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;5;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 1, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:o6aUfVhvelEJ:scholar.google.com/&scioq=Do+Convolutional+Neural+Networks+act+as+Compositional+Nearest+Neighbors%3F&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "title": "Interactive Grounded Language Acquisition and Generalization in a 2D World", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/275", "id": "H1UOm4gA-", "author_site": "Haonan Yu, Haichao Zhang, Wei Xu", "tldr": "Training an agent in a 2D virtual world for grounded language acquisition and generalization.", "abstract": "We build a virtual agent for learning language in a 2D maze-like world. The agent sees images of the surrounding environment, listens to a virtual teacher, and takes actions to receive rewards. It interactively learns the teacher\u2019s language from scratch based on two language use cases: sentence-directed navigation and question answering. It learns simultaneously the visual representations of the world, the language, and the action control. By disentangling language grounding from other computational routines and sharing a concept detection function between language grounding and prediction, the agent reliably interpolates and extrapolates to interpret sentences that contain new word combinations or new words missing from training sentences. The new words are transferred from the answers of language prediction. Such a language ability is trained and evaluated on a population of over 1.6 million distinct sentences consisting of 119 object words, 8 color words, 9 spatial-relation words, and 50 grammatical words. The proposed model significantly outperforms five comparison methods for interpreting zero-shot sentences. 
In addition, we demonstrate human-interpretable intermediate outputs of the model in the appendix.", "keywords": "grounded language learning and generalization;zero-shot language learning", "primary_area": "", "supplementary_material": "", "author": "Haonan Yu;Haichao Zhang;Wei Xu", "authorids": "haonanyu@baidu.com;zhanghaichao@baidu.com;wei.xu@baidu.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyu2018interactive,\ntitle={Interactive Grounded Language Acquisition and Generalization in a 2D World},\nauthor={Haonan Yu and Haichao Zhang and Wei Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1UOm4gA-},\n}", "github": "[![github](/images/github_icon.svg) PaddlePaddle/XWorld](https://github.com/PaddlePaddle/XWorld) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=H1UOm4gA-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4696587271474463712&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1UOm4gA-", "pdf": "https://openreview.net/pdf?id=H1UOm4gA-", "email": ";;", "author_num": 3 }, { "id": "H1U_af-0-", "title": "Quadrature-based features for kernel approximation", "track": "main", "status": "Reject", "tldr": "Quadrature rules for kernel approximation.", "abstract": "We consider the problem of improving kernel approximation via feature maps. These maps arise as Monte Carlo approximation to integral representations of kernel functions and scale up kernel methods for larger datasets. We propose to use more efficient numerical integration technique to obtain better estimates of the integrals compared to the state-of-the-art methods. Our approach allows to use information about the integrand to enhance approximation and facilitates fast computations. 
We derive the convergence behavior and conduct an extensive empirical study that supports our hypothesis.", "keywords": "kernel methods;low-rank approximation;quadrature rules;random features", "primary_area": "", "supplementary_material": "", "author": "Marina Munkhoeva;Yermek Kapushev;Evgeny Burnaev;Ivan Oseledets", "authorids": "marina.munkhoeva@skolkovotech.ru;kapushev@gmail.com;e.burnaev@skoltech.ru;i.oseledets@skoltech.ru", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmunkhoeva2018quadraturebased,\ntitle={Quadrature-based features for kernel approximation},\nauthor={Marina Munkhoeva and Yermek Kapushev and Evgeny Burnaev and Ivan Oseledets},\nyear={2018},\nurl={https://openreview.net/forum?id=H1U_af-0-},\n}", "github": "[![github](/images/github_icon.svg) maremun/quffka](https://github.com/maremun/quffka) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=H1U_af-0-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1U_af-0-", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5888962935039944528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "Enhancing The Reliability of Out-of-distribution Image Detection in Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/264", "id": "H1VGkIxRZ", "author_site": "R. Srikant, Shiyu Liang, Yixuan Li", "tldr": "", "abstract": "We consider the problem of detecting out-of-distribution images in neural networks. We propose ODIN, a simple and effective method that does not require any change to a pre-trained neural network. Our method is based on the observation that using temperature scaling and adding small perturbations to the input can separate the softmax score distributions of in- and out-of-distribution images, allowing for more effective detection. We show in a series of experiments that ODIN is compatible with diverse network architectures and datasets. It consistently outperforms the baseline approach by a large margin, establishing a new state-of-the-art performance on this task. For example, ODIN reduces the false positive rate from the baseline 34.7% to 4.3% on the DenseNet (applied to CIFAR-10 and Tiny-ImageNet) when the true positive rate is 95%.", "keywords": "Neural networks;out-of-distribution detection", "primary_area": "", "supplementary_material": "", "author": "Shiyu Liang;Yixuan Li;R. Srikant", "authorids": ";;rsrikant@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nliang2018enhancing,\ntitle={Enhancing The Reliability of Out-of-distribution Image Detection in Neural Networks},\nauthor={Shiyu Liang and Yixuan Li and R. 
Srikant},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1VGkIxRZ},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/odin](https://github.com/facebookresearch/odin) + [![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=H1VGkIxRZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 2556, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7536099354022278878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1VGkIxRZ", "pdf": "https://openreview.net/pdf?id=H1VGkIxRZ", "email": ";;", "author_num": 3 }, { "title": "The Role of Minimal Complexity Functions in Unsupervised Learning of Semantic Mappings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/154", "id": "H1VjBebR-", "author_site": "Tomer Galanti, Lior Wolf, Sagie Benaim", "tldr": "Our hypothesis is that given two domains, the lowest complexity mapping that has a low discrepancy approximates the target mapping.", "abstract": "We discuss the feasibility of the following learning problem: given unmatched samples from two domains and nothing else, learn a mapping between the two, which preserves semantics. Due to the lack of paired samples and without any definition of the semantic information, the problem might seem ill-posed. Specifically, in typical cases, it seems possible to build infinitely many alternative mappings from every target mapping. This apparent ambiguity stands in sharp contrast to the recent empirical success in solving this problem.\n\nWe identify the abstract notion of aligning two domains in a semantic way with concrete terms of minimal relative complexity. A theoretical framework for measuring the complexity of compositions of functions is developed in order to show that it is reasonable to expect the minimal complexity mapping to be unique. The measured complexity used is directly related to the depth of the neural networks being learned and a semantically aligned mapping could then be captured simply by learning using architectures that are not much bigger than the minimal architecture.\n\nVarious predictions are made based on the hypothesis that semantic alignment can be captured by the minimal mapping. These are verified extensively. 
In addition, a new mapping algorithm is proposed and shown to lead to better mapping results.", "keywords": "Unsupervised learning;cross-domain mapping;Kolmogorov complexity;Occam's razor", "primary_area": "", "supplementary_material": "", "author": "Tomer Galanti;Lior Wolf;Sagie Benaim", "authorids": "tomer22g@gmail.com;liorwolf@gmail.com;sagiebenaim@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngalanti2018the,\ntitle={The Role of Minimal Complexity Functions in Unsupervised Learning of Semantic Mappings},\nauthor={Tomer Galanti and Lior Wolf and Sagie Benaim},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1VjBebR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4316719845057568544&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1VjBebR-", "pdf": "https://openreview.net/pdf?id=H1VjBebR-", "email": ";;", "author_num": 3 }, { "title": "Learning Approximate Inference Networks for Structured Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/75", "id": "H1WgVz-AZ", "author_site": "Lifu Tu, Kevin Gimpel", "tldr": "", "abstract": "Structured prediction energy networks (SPENs; Belanger & McCallum 2016) use neural network architectures to define energy functions that can capture arbitrary dependencies among parts of structured outputs. Prior work used gradient descent for inference, relaxing the structured output to a set of continuous variables and then optimizing the energy with respect to them. We replace this use of gradient descent with a neural network trained to approximate structured argmax inference. This\n\u201cinference network\u201d outputs continuous values that we treat as the output structure. We develop large-margin training criteria for joint training of the structured energy function and inference network. On multi-label classification we report speed-ups\nof 10-60x compared to (Belanger et al., 2017) while also improving accuracy. For sequence labeling with simple structured energies, our approach performs comparably to exact inference while being much faster at test time. We then demonstrate improved accuracy by augmenting the energy with a \u201clabel language model\u201d that scores entire output label sequences, showing it can improve handling of long-distance dependencies in part-of-speech tagging. 
Finally, we show how inference networks can replace dynamic programming for test-time inference in conditional random fields, suggestive for their general use for fast inference in structured settings.", "keywords": "Approximate Inference Networks;Structured Prediction;Multi-Label Classification;Sequence Labeling", "primary_area": "", "supplementary_material": "", "author": "Lifu Tu;Kevin Gimpel", "authorids": "lifu@ttic.edu;kgimpel@ttic.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntu2018learning,\ntitle={Learning Approximate Inference Networks for Structured Prediction},\nauthor={Lifu Tu and Kevin Gimpel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1WgVz-AZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=H1WgVz-AZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;9", "confidence": "3;5;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.5, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15139166138025386131&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=H1WgVz-AZ", "pdf": "https://openreview.net/pdf?id=H1WgVz-AZ", "email": ";", "author_num": 2 }, { "id": "H1Ww66x0-", "title": "Lifelong Learning with Output Kernels", "track": "main", "status": "Reject", "tldr": "a novel approach for online lifelong learning using output kernels.", "abstract": "Lifelong learning poses considerable challenges in terms of effectiveness (minimizing prediction errors for all tasks) and overall computational tractability for real-time performance. This paper addresses continuous lifelong multitask learning by jointly re-estimating the inter-task relations (\\textit{output} kernel) and the per-task model parameters at each round, assuming data arrives in a streaming fashion. We propose a novel algorithm called \\textit{Online Output Kernel Learning Algorithm} (OOKLA) for lifelong learning setting. To avoid the memory explosion, we propose a robust budget-limited versions of the proposed algorithm that efficiently utilize the relationship between the tasks to bound the total number of representative examples in the support set. In addition, we propose a two-stage budgeted scheme for efficiently tackling the task-specific budget constraints in lifelong learning. 
Our empirical results over three datasets indicate superior AUC performance for OOKLA and its budget-limited cousins over strong baselines.", "keywords": "multitask learning;lifelong learning;online learning", "primary_area": "", "supplementary_material": "", "author": "Keerthiram Murugesan;Jaime Carbonell", "authorids": "kmuruges@cs.cmu.edu;jgc@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmurugesan2018lifelong,\ntitle={Lifelong Learning with Output Kernels},\nauthor={Keerthiram Murugesan and Jaime Carbonell},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Ww66x0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1Ww66x0-", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YBnE-5bEG9YJ:scholar.google.com/&scioq=Lifelong+Learning+with+Output+Kernels&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Leveraging Grammar and Reinforcement Learning for Neural Program Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/294", "id": "H1Xw62kRZ", "author_site": "Rudy Bunel, Matthew Hausknecht, Jacob Devlin, Rishabh Singh, Pushmeet Kohli", "tldr": "Using the DSL grammar and reinforcement learning to improve synthesis of programs with complex control flow.", "abstract": "Program synthesis is the task of automatically generating a program consistent with\na specification. Recent years have seen proposal of a number of neural approaches\nfor program synthesis, many of which adopt a sequence generation paradigm similar\nto neural machine translation, in which sequence-to-sequence models are trained to\nmaximize the likelihood of known reference programs. While achieving impressive\nresults, this strategy has two key limitations. First, it ignores Program Aliasing: the\nfact that many different programs may satisfy a given specification (especially with\nincomplete specifications such as a few input-output examples). By maximizing\nthe likelihood of only a single reference program, it penalizes many semantically\ncorrect programs, which can adversely affect the synthesizer performance. Second,\nthis strategy overlooks the fact that programs have a strict syntax that can be\nefficiently checked. To address the first limitation, we perform reinforcement\nlearning on top of a supervised model with an objective that explicitly maximizes\nthe likelihood of generating semantically correct programs. 
For addressing the\nsecond limitation, we introduce a training procedure that directly maximizes the\nprobability of generating syntactically correct programs that fulfill the specification.\nWe show that our contributions lead to improved accuracy of the models, especially\nin cases where the training data is limited.", "keywords": "Program Synthesis;Reinforcement Learning;Language Model", "primary_area": "", "supplementary_material": "", "author": "Rudy Bunel;Matthew Hausknecht;Jacob Devlin;Rishabh Singh;Pushmeet Kohli", "authorids": "rudy@robots.ox.ac.uk;mahauskn@microsoft.com;jacobdevlin@google.com;risin@microsoft.com;pushmeet@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbunel2018leveraging,\ntitle={Leveraging Grammar and Reinforcement Learning for Neural Program Synthesis},\nauthor={Rudy Bunel and Matthew Hausknecht and Jacob Devlin and Rishabh Singh and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Xw62kRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 257, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14342984430007145123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1Xw62kRZ", "pdf": "https://openreview.net/pdf?id=H1Xw62kRZ", "email": ";;;;", "author_num": 5 }, { "title": "Learning Sparse Neural Networks through L_0 Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/222", "id": "H1Y8hhg0b", "author_site": "Christos Louizos, Max Welling, Diederik Kingma", "tldr": "We show how to optimize the expected L_0 norm of parametric models with gradient descent and introduce a new distribution that facilitates hard gating.", "abstract": "We propose a practical method for $L_0$ norm regularization for neural networks: pruning the network during training by encouraging weights to become exactly zero. Such regularization is interesting since (1) it can greatly speed up training and inference, and (2) it can improve generalization. AIC and BIC, well-known model selection criteria, are special cases of $L_0$ regularization. However, since the $L_0$ norm of weights is non-differentiable, we cannot incorporate it directly as a regularization term in the objective function. We propose a solution through the inclusion of a collection of non-negative stochastic gates, which collectively determine which weights to set to zero. We show that, somewhat surprisingly, for certain distributions over the gates, the expected $L_0$ regularized objective is differentiable with respect to the distribution parameters. We further propose the \\emph{hard concrete} distribution for the gates, which is obtained by ``stretching'' a binary concrete distribution and then transforming its samples with a hard-sigmoid. The parameters of the distribution over the gates can then be jointly optimized with the original network parameters. 
As a result our method allows for straightforward and efficient learning of model structures with stochastic gradient descent and allows for conditional computation in a principled way. We perform various experiments to demonstrate the effectiveness of the resulting approach and regularizer.", "keywords": "Sparsity;compression;hard and soft attention.", "primary_area": "", "supplementary_material": "", "author": "Christos Louizos;Max Welling;Diederik P. Kingma", "authorids": "c.louizos@uva.nl;m.welling@uva.nl;dpkingma@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlouizos2018learning,\ntitle={Learning Sparse Neural Networks through L_0 Regularization},\nauthor={Christos Louizos and Max Welling and Diederik P. Kingma},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Y8hhg0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1418, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16875065764676968506&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1Y8hhg0b", "pdf": "https://openreview.net/pdf?id=H1Y8hhg0b", "email": ";;", "author_num": 3 }, { "title": "An Online Learning Approach to Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/301", "id": "H1Yp-j1Cb", "author_site": "Paulina Grnarova, Kfir Y Levy, Aurelien Lucchi, Thomas Hofmann, Andreas Krause", "tldr": "", "abstract": "We consider the problem of training generative models with a Generative Adversarial Network (GAN). Although GANs can accurately model complex distributions, they are known to be difficult to train due to instabilities caused by a difficult minimax optimization problem. In this paper, we view the problem of training GANs as finding a mixed strategy in a zero-sum game. Building on ideas from online learning we propose a novel training method named Chekhov GAN. On the theory side, we show that our method provably converges to an equilibrium for semi-shallow GAN architectures, i.e. architectures where the discriminator is a one-layer network and the generator is arbitrary. 
On the practical side, we develop an efficient heuristic guided by our theoretical results, which we apply to commonly used deep GAN architectures.\nOn several real-world tasks our approach exhibits improved stability and performance compared to standard GAN training.", "keywords": "Generative Adversarial Networks;GANs;online learning", "primary_area": "", "supplementary_material": "", "author": "Paulina Grnarova;Kfir Y Levy;Aurelien Lucchi;Thomas Hofmann;Andreas Krause", "authorids": "paulina.grnarova@inf.ethz.ch;yehuda.levy@inf.ethz.ch;aurelien.lucchi@inf.ethz.ch;thomas.hofmann@inf.ethz.ch;krausea@ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngrnarova2018an,\ntitle={An Online Learning Approach to Generative Adversarial Networks},\nauthor={Paulina Grnarova and Kfir Y Levy and Aurelien Lucchi and Thomas Hofmann and Andreas Krause},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1Yp-j1Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13905628345653607313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1Yp-j1Cb", "pdf": "https://openreview.net/pdf?id=H1Yp-j1Cb", "email": ";;;;", "author_num": 5 }, { "id": "H1YynweCb", "title": "Kronecker Recurrent Units", "track": "main", "status": "Workshop", "tldr": "Out work presents a Kronecker factorization of recurrent weight matrices for parameter efficient and well conditioned recurrent neural networks.", "abstract": "Our work addresses two important issues with recurrent neural networks: (1) they are over-parameterized, and (2) the recurrent weight matrix is ill-conditioned. The former increases the sample complexity of learning and the training time. The latter causes the vanishing and exploding gradient problem. We present a flexible recurrent neural network model called Kronecker Recurrent Units (KRU). KRU achieves parameter efficiency in RNNs through a Kronecker factored recurrent matrix. It overcomes the ill-conditioning of the recurrent matrix by enforcing soft unitary constraints on the factors. Thanks to the small dimensionality of the factors, maintaining these constraints is computationally efficient. Our experimental results on seven standard data-sets reveal that KRU can reduce the number of parameters by three orders of magnitude in the recurrent weight matrix compared to the existing recurrent models, without trading the statistical performance. 
These results in particular show that while there are advantages in having a high dimensional recurrent space, the capacity of the recurrent part of the model can be dramatically reduced.", "keywords": "Recurrent neural network;Vanishing and exploding gradients;Parameter efficiency;Kronecker matrices;Soft unitary constraint", "primary_area": "", "supplementary_material": "", "author": "Cijo Jose;Moustapha Cisse;Francois Fleuret", "authorids": "cijo.jose@idiap.ch;moustaphacisse@fb.com;francois.fleuret@idiap.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njose2018kronecker,\ntitle={Kronecker Recurrent Units},\nauthor={Cijo Jose and Moustapha Cisse and Francois Fleuret},\nyear={2018},\nurl={https://openreview.net/forum?id=H1YynweCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1YynweCb", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;5;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14809178725284478631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "H1a37GWCZ", "title": "UNSUPERVISED SENTENCE EMBEDDING USING DOCUMENT STRUCTURE-BASED CONTEXT", "track": "main", "status": "Reject", "tldr": "To train a sentence embedding using technical documents, our approach considers document structure to find broader context and handle out-of-vocabulary words.", "abstract": "We present a new unsupervised method for learning general-purpose sentence embeddings.\nUnlike existing methods which rely on local contexts, such as words\ninside the sentence or immediately neighboring sentences, our method selects, for\neach target sentence, influential sentences in the entire document based on a document\nstructure. We identify a dependency structure of sentences using metadata\nor text styles. Furthermore, we propose a novel out-of-vocabulary word handling\ntechnique to model many domain-specific terms, which were mostly discarded by\nexisting sentence embedding methods. 
We validate our model on several tasks\nshowing 30% precision improvement in coreference resolution in a technical domain,\nand 7.5% accuracy increase in paraphrase detection compared to baselines.", "keywords": "distributed representation;sentence embedding;structure;technical documents;sentence embedding;out-of-vocabulary", "primary_area": "", "supplementary_material": "", "author": "Taesung Lee;Youngja Park", "authorids": "taesung.lee@ibm.com;young_park@us.ibm.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2018unsupervised,\ntitle={{UNSUPERVISED} {SENTENCE} {EMBEDDING} {USING} {DOCUMENT} {STRUCTURE}-{BASED} {CONTEXT}},\nauthor={Taesung Lee and Youngja Park},\nyear={2018},\nurl={https://openreview.net/forum?id=H1a37GWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1a37GWCZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7706315502028504655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Active Learning for Convolutional Neural Networks: A Core-Set Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/194", "id": "H1aIuk-RW", "author_site": "Ozan Sener, Silvio Savarese", "tldr": "We approach to the problem of active learning as a core-set selection problem and show that this approach is especially useful in the batch active learning setting which is crucial when training CNNs.", "abstract": "Convolutional neural networks (CNNs) have been successfully applied to many recognition and learning tasks using a universal recipe; training a deep model on a very large dataset of supervised examples. However, this approach is rather restrictive in practice since collecting a large set of labeled images is very expensive. One way to ease this problem is coming up with smart ways for choosing images to be labelled from a very large collection (i.e. active learning).\n\nOur empirical study suggests that many of the active learning heuristics in the literature are not effective when applied to CNNs when applied in batch setting. Inspired by these limitations, we define the problem of active learning as core-set selection, i.e. choosing set of points such that a model learned over the selected subset is competitive for the remaining data points. We further present a theoretical result characterizing the performance of any selected subset using the geometry of the datapoints. As an active learning algorithm, we choose the subset which is expected to yield best result according to our characterization. 
Our experiments show that the proposed method significantly outperforms existing approaches in image classification experiments by a large margin.\n", "keywords": "Active Learning;Convolutional Neural Networks;Core-Set Selection", "primary_area": "", "supplementary_material": "", "author": "Ozan Sener;Silvio Savarese", "authorids": "ozansener@cs.stanford.edu;ssilvio@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nsener2018active,\ntitle={Active Learning for Convolutional Neural Networks: A Core-Set Approach},\nauthor={Ozan Sener and Silvio Savarese},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1aIuk-RW},\n}", "github": "[![github](/images/github_icon.svg) ozansener/active_learning_coreset](https://github.com/ozansener/active_learning_coreset) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=H1aIuk-RW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 2435, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11951024346317000591&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1aIuk-RW", "pdf": "https://openreview.net/pdf?id=H1aIuk-RW", "email": ";", "author_num": 2 }, { "id": "H1bM1fZCW", "title": "GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks", "track": "main", "status": "Reject", "tldr": "We show how you can boost performance in a multitask network by tuning an adaptive multitask loss function that is learned through directly balancing network gradients.", "abstract": "Deep multitask networks, in which one neural network produces multiple predictive outputs, are more scalable and often better regularized than their single-task counterparts. Such advantages can potentially lead to gains in both speed and performance, but multitask networks are also difficult to train without finding the right balance between tasks. We present a novel gradient normalization (GradNorm) technique which automatically balances the multitask loss function by directly tuning the gradients to equalize task training rates. We show that for various network architectures, for both regression and classification tasks, and on both synthetic and real datasets, GradNorm improves accuracy and reduces overfitting over single networks, static baselines, and other adaptive multitask loss balancing techniques. GradNorm also matches or surpasses the performance of exhaustive grid search methods, despite only involving a single asymmetry hyperparameter $\\alpha$. Thus, what was once a tedious search process which incurred exponentially more compute for each task added can now be accomplished within a few training runs, irrespective of the number of tasks. 
Ultimately, we hope to demonstrate that gradient manipulation affords us great control over the training dynamics of multitask networks and may be one of the keys to unlocking the potential of multitask learning.", "keywords": "Multitask learning;computer vision;multitask loss function", "primary_area": "", "supplementary_material": "", "author": "Zhao Chen;Vijay Badrinarayanan;Chen-Yu Lee;Andrew Rabinovich", "authorids": "zchen@magicleap.com;vbadrinarayanan@magicleap.com;clee@magicleap.com;arabinovich@magicleap.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2018gradnorm,\ntitle={GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks},\nauthor={Zhao Chen and Vijay Badrinarayanan and Chen-Yu Lee and Andrew Rabinovich},\nyear={2018},\nurl={https://openreview.net/forum?id=H1bM1fZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1bM1fZCW", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 1615, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13835759043891121290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "H1bhRHeA-", "title": "Unbiased scalable softmax optimization", "track": "main", "status": "Reject", "tldr": "Propose first methods for exactly optimizing the softmax distribution using stochastic gradient with runtime independent on the number of classes or datapoints.", "abstract": "Recent neural network and language models have begun to rely on softmax distributions with an extremely large number of categories. In this context calculating the softmax normalizing constant is prohibitively expensive. This has spurred a growing literature of efficiently computable but biased estimates of the softmax. In this paper we present the first two unbiased algorithms for maximizing the softmax likelihood whose work per iteration is independent of the number of classes and datapoints (and does not require extra work at the end of each epoch). 
We compare our unbiased methods' empirical performance to the state-of-the-art on seven real world datasets, where they comprehensively outperform all competitors.", "keywords": "softmax;optimization;implicit sgd", "primary_area": "", "supplementary_material": "", "author": "Francois Fagan;Garud Iyengar", "authorids": "ff2316@columbia.edu;garud@ieor.columbia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfagan2018unbiased,\ntitle={Unbiased scalable softmax optimization},\nauthor={Francois Fagan and Garud Iyengar},\nyear={2018},\nurl={https://openreview.net/forum?id=H1bhRHeA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1bhRHeA-", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6095938853455284981&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "H1cKvl-Rb", "title": "UCB EXPLORATION VIA Q-ENSEMBLES", "track": "main", "status": "Reject", "tldr": "Adapting UCB exploration to ensemble Q-learning improves over prior methods such as Double DQN, A3C+ on Atari benchmark", "abstract": "We show how an ensemble of $Q^*$-functions can be leveraged for more effective exploration in deep reinforcement learning. We build on well established algorithms from the bandit setting, and adapt them to the $Q$-learning setting. We propose an exploration strategy based on upper-confidence bounds (UCB). Our experiments show significant gains on the Atari benchmark. ", "keywords": "Reinforcement learning;Q-learning;ensemble method;upper confidence bound", "primary_area": "", "supplementary_material": "", "author": "Richard Y. Chen;Szymon Sidor;Pieter Abbeel;John Schulman", "authorids": "richardchen@openai.com;szymon@openai.com;pabbeel@cs.berkeley.edu;joschu@openai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ny.2018ucb,\ntitle={{UCB} {EXPLORATION} {VIA} Q-{ENSEMBLES}},\nauthor={Richard Y. Chen and Szymon Sidor and Pieter Abbeel and John Schulman},\nyear={2018},\nurl={https://openreview.net/forum?id=H1cKvl-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1cKvl-Rb", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;5;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13260404166621290240&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1cT3NTBM", "title": "Learning Audio Features for Singer Identification and Embedding", "track": "main", "status": "Withdraw", "tldr": "Using deep learning techniques on singing voice related tasks.", "abstract": "There has been an increasing use of neural networks for music information retrieval tasks. In this paper, we empirically investigate different ways of improving the performance of convolutional neural networks (CNNs) on spectral audio features. 
More specifically, we explore three aspects of CNN design: depth of the network, the use of residual blocks along with the use of grouped convolution, and global aggregation over time. The application context is singer classification and singing performance embedding and we believe the conclusions extend to other types of music analysis using convolutional neural networks. The results show that global time aggregation helps to improve the performance of CNNs the most. Another contribution of this paper is the release of a singing recording dataset that can be used for training and evaluation. ", "keywords": "convolution neural networks;attention;music information retrieval", "primary_area": "", "supplementary_material": "", "author": "Cheng-i Wang;George Tzanetakis", "authorids": "chw160@ucsd.edu;gtzan@cs.uvic.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=H1cT3NTBM", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9546419711015678491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Bi-Directional Block Self-Attention for Fast and Memory-Efficient Sequence Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/234", "id": "H1cWzoxA-", "author_site": "Tao Shen, Tianyi Zhou, Guodong Long, Jing Jiang, Chengqi Zhang", "tldr": "A self-attention network for RNN/CNN-free sequence encoding with small memory consumption, highly parallelizable computation and state-of-the-art performance on several NLP tasks", "abstract": "Recurrent neural networks (RNN), convolutional neural networks (CNN) and self-attention networks (SAN) are commonly used to produce context-aware representations. RNN can capture long-range dependency but is hard to parallelize and not time-efficient. CNN focuses on local dependency but does not perform well on some tasks. SAN can model both such dependencies via highly parallelizable computation, but memory requirement grows rapidly in line with sequence length. In this paper, we propose a model, called "bi-directional block self-attention network (Bi-BloSAN)", for RNN/CNN-free sequence encoding. It requires as little memory as RNN but with all the merits of SAN. Bi-BloSAN splits the entire sequence into blocks, and applies an intra-block SAN to each block for modeling local context, then applies an inter-block SAN to the outputs for all blocks to capture long-range dependency. Thus, each SAN only needs to process a short sequence, and only a small amount of memory is required. Additionally, we use feature-level attention to handle the variation of contexts around the same word, and use forward/backward masks to encode temporal order information. On nine benchmark datasets for different NLP tasks, Bi-BloSAN achieves or improves upon state-of-the-art accuracy, and shows better efficiency-memory trade-off than existing RNN/CNN/SAN. 
", "keywords": "deep learning;attention mechanism;sequence modeling;natural language processing;sentence embedding", "primary_area": "", "supplementary_material": "", "author": "Tao Shen;Tianyi Zhou;Guodong Long;Jing Jiang;Chengqi Zhang", "authorids": "tao.shen@student.uts.edu.au;tianyizh@uw.edu;guodong.long@uts.edu.au;jing.jiang@uts.edu.au;chengqi.zhang@uts.edu.au", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nshen2018bidirectional,\ntitle={Bi-Directional Block Self-Attention for Fast and Memory-Efficient Sequence Modeling},\nauthor={Tao Shen and Tianyi Zhou and Guodong Long and Jing Jiang and Chengqi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1cWzoxA-},\n}", "github": "[![github](/images/github_icon.svg) taoshen58/BiBloSA](https://github.com/taoshen58/BiBloSA)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7203374430207428965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1cWzoxA-", "pdf": "https://openreview.net/pdf?id=H1cWzoxA-", "email": ";;;;", "author_num": 5 }, { "title": "TreeQN and ATreeC: Differentiable Tree-Structured Models for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/198", "id": "H1dh6Ax0Z", "author_site": "Gregory Farquhar, Tim Rocktaeschel, Maximilian Igl, Shimon Whiteson", "tldr": "We present TreeQN and ATreeC, new architectures for deep reinforcement learning in discrete-action domains that integrate differentiable on-line tree planning into the action-value function or policy.", "abstract": "Combining deep model-free reinforcement learning with on-line planning is a promising approach to building on the successes of deep RL. On-line planning with look-ahead trees has proven successful in environments where transition models are known a priori. However, in complex environments where transition models need to be learned from data, the deficiencies of learned models have limited their utility for planning. To address these challenges, we propose TreeQN, a differentiable, recursive, tree-structured model that serves as a drop-in replacement for any value function network in deep RL with discrete actions. TreeQN dynamically constructs a tree by recursively applying a transition model in a learned abstract state space and then aggregating predicted rewards and state-values using a tree backup to estimate Q-values. We also propose ATreeC, an actor-critic variant that augments TreeQN with a softmax layer to form a stochastic policy network. Both approaches are trained end-to-end, such that the learned model is optimised for its actual use in the tree. We show that TreeQN and ATreeC outperform n-step DQN and A2C on a box-pushing task, as well as n-step DQN and value prediction networks (Oh et al., 2017) on multiple Atari games. 
Furthermore, we present ablation studies that demonstrate the effect of different auxiliary losses on learning transition models.", "keywords": "reinforcement learning;deep learning;planning", "primary_area": "", "supplementary_material": "", "author": "Gregory Farquhar;Tim Rockt\u00e4schel;Maximilian Igl;Shimon Whiteson", "authorids": "gregory.farquhar@cs.ox.ac.uk;tim.rocktaeschel@gmail.com;maximilian.igl@gmail.com;shimon.whiteson@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nfarquhar2018treeqn,\ntitle={Tree{QN} and {AT}reeC: Differentiable Tree Planning for Deep Reinforcement Learning},\nauthor={Gregory Farquhar and Tim Rocktaeschel and Maximilian Igl and Shimon Whiteson},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1dh6Ax0Z},\n}", "github": "[![github](/images/github_icon.svg) oxwhirl/treeqn](https://github.com/oxwhirl/treeqn)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;3;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.2773500981126145, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10647768083329764430&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1dh6Ax0Z", "pdf": "https://openreview.net/pdf?id=H1dh6Ax0Z", "email": ";;;", "author_num": 4 }, { "id": "H1eJxngCW", "title": "DuoRC: Towards Complex Language Understanding with Paraphrased Reading Comprehension", "track": "main", "status": "Workshop", "tldr": "We propose DuoRC, a novel dataset for Reading Comprehension (RC) containing 186,089 human-generated QA pairs created from a collection of 7680 pairs of parallel movie plots and introduce a RC task of reading one version of the plot and answering questions created from the other version; thus by design, requiring complex reasoning and deeper language understanding to overcome the poor lexical overlap between the plot and the question.", "abstract": "We propose DuoRC, a novel dataset for Reading Comprehension (RC) that motivates several new challenges for neural approaches in language understanding beyond those offered by existing RC datasets. DuoRC contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots where each pair in the collection reflects two versions of the same movie - one from Wikipedia and the other from IMDb - written by two different authors. We asked crowdsourced workers to create questions from one version of the plot and a different set of workers to extract or synthesize corresponding answers from the other version. This unique characteristic of DuoRC where questions and answers are created from different versions of a document narrating the same underlying story, ensures by design, that there is very little lexical overlap between the questions created from one version and the segments containing the answer in the other version. Further, since the two versions have different level of plot detail, narration style, vocabulary, etc., answering questions from the second version requires deeper language understanding and incorporating background knowledge not available in the given text. 
Additionally, the narrative style of passages arising from movie plots (as opposed to typical descriptive passages in existing datasets) exhibits the need to perform complex reasoning over events across multiple sentences. Indeed, we observe that state-of-the-art neural RC models which have achieved near human performance on the SQuAD dataset, even when coupled with traditional NLP techniques to address the challenges presented in DuoRC exhibit very poor performance (F1 score of 37.42% on DuoRC v/s 86% on SQuAD dataset). This opens up several interesting research avenues wherein DuoRC could complement other Reading Comprehension style datasets to explore novel neural approaches for studying language understanding.", "keywords": "reading comprehension;question answering", "primary_area": "", "supplementary_material": "", "author": "Amrita Saha;Rahul Aralikatte;Mitesh M. Khapra;Karthik Sankaranarayanan", "authorids": "amrita.saha87@gmail.com;rahul.a.r@in.ibm.com;miteshk@cse.iitm.ac.in;kartsank@in.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsaha2018duorc,\ntitle={Duo{RC}: Towards Complex Language Understanding with Paraphrased Reading Comprehension},\nauthor={Amrita Saha and Rahul Aralikatte and Mitesh M. Khapra and Karthik Sankaranarayanan},\nyear={2018},\nurl={https://openreview.net/forum?id=H1eJxngCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1eJxngCW", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5661347701461811081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Variational Inference of Disentangled Latent Concepts from Unlabeled Observations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/82", "id": "H1kG7GZAW", "author_site": "Abhishek Kumar, Prasanna Sattigeri, Avinash Balakrishnan", "tldr": "We propose a variational inference based approach for encouraging the inference of disentangled latents. We also propose a new metric for quantifying disentanglement. ", "abstract": "Disentangled representations, where the higher level data generative factors are reflected in disjoint latent dimensions, offer several benefits such as ease of deriving invariant representations, transferability to other tasks, interpretability, etc. We consider the problem of unsupervised learning of disentangled representations from large pool of unlabeled observations, and propose a variational inference based approach to infer disentangled latent factors. We introduce a regularizer on the expectation of the approximate posterior over observed data that encourages the disentanglement. We also propose a new disentanglement metric which is better aligned with the qualitative disentanglement observed in the decoder's output. We empirically observe significant improvement over existing methods in terms of both disentanglement and data likelihood (reconstruction quality). 
\n\n", "keywords": "disentangled representations;variational inference", "primary_area": "", "supplementary_material": "", "author": "Abhishek Kumar;Prasanna Sattigeri;Avinash Balakrishnan", "authorids": "abhishk@us.ibm.com;psattig@us.ibm.com;avinash.bala@us.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkumar2018variational,\ntitle={{VARIATIONAL} {INFERENCE} {OF} {DISENTANGLED} {LATENT} {CONCEPTS} {FROM} {UNLABELED} {OBSERVATIONS}},\nauthor={Abhishek Kumar and Prasanna Sattigeri and Avinash Balakrishnan},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1kG7GZAW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=H1kG7GZAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 607, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=84314681776183574&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1kG7GZAW", "pdf": "https://openreview.net/pdf?id=H1kG7GZAW", "email": ";;", "author_num": 3 }, { "id": "H1kMMmb0-", "title": "Sequential Coordination of Deep Models for Learning Visual Arithmetic", "track": "main", "status": "Reject", "tldr": "We use reinforcement learning to train an agent to solve a set of visual arithmetic tasks using provided pre-trained perceptual modules and transformations of internal representations created by those modules.", "abstract": "Achieving machine intelligence requires a smooth integration of perception and reasoning, yet models developed to date tend to specialize in one or the other; sophisticated manipulation of symbols acquired from rich perceptual spaces has so far proved elusive. Consider a visual arithmetic task, where the goal is to carry out simple arithmetical algorithms on digits presented under natural conditions (e.g. hand-written, placed randomly). We propose a two-tiered architecture for tackling this kind of problem. The lower tier consists of a heterogeneous collection of information processing modules, which can include pre-trained deep neural networks for locating and extracting characters from the image, as well as modules performing symbolic transformations on the representations extracted by perception. The higher tier consists of a controller, trained using reinforcement learning, which coordinates the modules in order to solve the high-level task. For instance, the controller may learn in what contexts to execute the perceptual networks and what symbolic transformations to apply to their outputs. 
The resulting model is able to solve a variety of tasks in the visual arithmetic domain,and has several advantages over standard, architecturally homogeneous feedforward networks including improved sample efficiency.", "keywords": "reinforcement learning;pretrained;deep learning;perception;algorithmic", "primary_area": "", "supplementary_material": "", "author": "Eric Crawford;Guillaume Rabusseau;Joelle Pineau", "authorids": "eric.crawford@mail.mcgill.ca;guillaume.rabusseau@mail.mcgill.ca;jpineau@cs.mcgill.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncrawford2018sequential,\ntitle={Sequential Coordination of Deep Models for Learning Visual Arithmetic},\nauthor={Eric Crawford and Guillaume Rabusseau and Joelle Pineau},\nyear={2018},\nurl={https://openreview.net/forum?id=H1kMMmb0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1kMMmb0-", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MMj3DgjLu4sJ:scholar.google.com/&scioq=Sequential+Coordination+of+Deep+Models+for+Learning+Visual+Arithmetic&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "H1l8sz-AW", "title": "Improving generalization by regularizing in $L^2$ function space", "track": "main", "status": "Reject", "tldr": "It's important to consider optimization in function space, not just parameter space. We introduce a learning rule that reduces distance traveled in function space, just like SGD limits distance traveled in parameter space.", "abstract": "Learning rules for neural networks necessarily include some form of regularization. Most regularization techniques are conceptualized and implemented in the space of parameters. However, it is also possible to regularize in the space of functions. Here, we propose to measure networks in an $L^2$ Hilbert space, and test a learning rule that regularizes the distance a network can travel through $L^2$-space each update. This approach is inspired by the slow movement of gradient descent through parameter space as well as by the natural gradient, which can be derived from a regularization term upon functional change. The resulting learning rule, which we call Hilbert-constrained gradient descent (HCGD), is thus closely related to the natural gradient but regularizes a different and more calculable metric over the space of functions. Experiments show that the HCGD is efficient and leads to considerably better generalization. 
", "keywords": "natural gradient;generalization;optimization;function space;Hilbert", "primary_area": "", "supplementary_material": "", "author": "Ari S Benjamin;Konrad Kording", "authorids": "aarrii@seas.upenn.edu;aarrii@seas.upenn.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ns2018improving,\ntitle={Improving generalization by regularizing in $L^2$ function space},\nauthor={Ari S Benjamin and Konrad Kording},\nyear={2018},\nurl={https://openreview.net/forum?id=H1l8sz-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1l8sz-AW", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8039056337966042472&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Action-dependent Control Variates for Policy Optimization via Stein Identity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/106", "id": "H1mCp-ZRZ", "author_site": "Hao Liu, Yihao Feng, Yi Mao, Dengyong Zhou, Jian Peng, Qiang Liu", "tldr": "", "abstract": "Policy gradient methods have achieved remarkable successes in solving challenging reinforcement learning problems. However, it still often suffers from the large variance issue on policy gradient estimation, which leads to poor sample efficiency during training. In this work, we propose a control variate method to effectively reduce variance for policy gradient methods. Motivated by the Stein\u2019s identity, our method extends the previous control variate methods used in REINFORCE and advantage actor-critic by introducing more flexible and general action-dependent baseline functions. 
Empirical studies show that our method essentially improves the sample efficiency of the state-of-the-art policy gradient approaches.\n", "keywords": "reinforcement learning;control variates;sample efficiency;variance reduction", "primary_area": "", "supplementary_material": "", "author": "Hao Liu*;Yihao Feng*;Yi Mao;Dengyong Zhou;Jian Peng;Qiang Liu", "authorids": "uestcliuhao@gmail.com;yihao@cs.utexas.edu;maoyi@microsoft.com;dennyzhou@google.com;jianpeng@illinois.edu;lqiang@cs.utexas.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nliu2018actiondependent,\ntitle={Action-dependent Control Variates for Policy Optimization via Stein Identity},\nauthor={Hao Liu and Yihao Feng and Yi Mao and Dengyong Zhou and Jian Peng and Qiang Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1mCp-ZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17185260958337372478&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1mCp-ZRZ", "pdf": "https://openreview.net/pdf?id=H1mCp-ZRZ", "email": ";;;;;", "author_num": 6 }, { "title": "DCN+: Mixed Objective And Deep Residual Coattention for Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/258", "id": "H1meywxRW", "author_site": "Caiming Xiong, richard socher, Victor Zhong", "tldr": "We introduce the DCN+ with deep residual coattention and mixed-objective RL, which achieves state of the art performance on the Stanford Question Answering Dataset.", "abstract": "Traditional models for question answering optimize using cross entropy loss, which encourages exact answers at the cost of penalizing nearby or overlapping answers that are sometimes equally accurate. We propose a mixed objective that combines cross entropy loss with self-critical policy learning, using rewards derived from word overlap to solve the misalignment between evaluation metric and optimization objective. In addition to the mixed objective, we introduce a deep residual coattention encoder that is inspired by recent work in deep self-attention and residual networks. Our proposals improve model performance across question types and input lengths, especially for long questions that requires the ability to capture long-term dependencies. 
On the Stanford Question Answering Dataset, our model achieves state of the art results with 75.1% exact match accuracy and 83.1% F1, while the ensemble obtains 78.9% exact match accuracy and 86.0% F1.", "keywords": "question answering;deep learning;natural language processing;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Caiming Xiong;Victor Zhong;Richard Socher", "authorids": "cxiong@salesforce.com;richard@socher.org;victor@victorzhong.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nxiong2018dcn,\ntitle={{DCN}+: Mixed Objective And Deep Residual Coattention for Question Answering},\nauthor={Caiming Xiong and Victor Zhong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1meywxRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;2", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5550299141473873647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1meywxRW", "pdf": "https://openreview.net/pdf?id=H1meywxRW", "email": ";;", "author_num": 3 }, { "id": "H1pri9vTZ", "title": "Deep Function Machines: Generalized Neural Networks for Topological Layer Expression", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we propose a generalization of deep neural networks called deep function machines (DFMs). DFMs act on vector spaces of arbitrary (possibly infinite) dimension and we show that a family of DFMs are invariant to the dimension of input data; that is, the parameterization of the model does not directly hinge on the quality of the input (eg. high resolution images). Using this generalization we provide a new theory of universal approximation of bounded non-linear operators between function spaces. We then suggest that DFMs provide an expressive framework for designing new neural network layer types with topological considerations in mind. Finally, we introduce a novel architecture, RippLeNet, for resolution invariant computer vision, which empirically achieves state of the art invariance.", "keywords": "deep learning theory;infinite neural networks;topology", "primary_area": "", "supplementary_material": "", "author": "William H. Guss", "authorids": "wguss@cs.cmu.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nh.2018deep,\ntitle={Deep Function Machines: Generalized Neural Networks for Topological Layer Expression},\nauthor={William H. 
Guss},\nyear={2018},\nurl={https://openreview.net/forum?id=H1pri9vTZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1pri9vTZ", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;3;1", "rating_avg": 4.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": -0.9958705948858225, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15570692444466956779&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "A DIRT-T Approach to Unsupervised Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/26", "id": "H1q-TM-AW", "author_site": "Rui Shu, Hung H Bui, Hirokazu Narui, Stefano Ermon", "tldr": "SOTA on unsupervised domain adaptation by leveraging the cluster assumption.", "abstract": "Domain adaptation refers to the problem of leveraging labeled data in a source domain to learn an accurate model in a target domain where labels are scarce or unavailable. A recent approach for finding a common representation of the two domains is via domain adversarial training (Ganin & Lempitsky, 2015), which attempts to induce a feature extractor that matches the source and target feature distributions in some feature space. However, domain adversarial training faces two critical limitations: 1) if the feature extraction function has high-capacity, then feature distribution matching is a weak constraint, 2) in non-conservative domain adaptation (where no single classifier can perform well in both the source and target domains), training the model to do well on the source domain hurts performance on the target domain. In this paper, we address these issues through the lens of the cluster assumption, i.e., decision boundaries should not cross high-density data regions. We propose two novel and related models: 1) the Virtual Adversarial Domain Adaptation (VADA) model, which combines domain adversarial training with a penalty term that punishes the violation of the cluster assumption; 2) the Decision-boundary Iterative Refinement Training with a Teacher (DIRT-T) model, which takes the VADA model as initialization and employs natural gradient steps to further minimize the cluster assumption violation. 
Extensive empirical results demonstrate that the combination of these two models significantly improve the state-of-the-art performance on the digit, traffic sign, and Wi-Fi recognition domain adaptation benchmarks.", "keywords": "domain adaptation;unsupervised learning;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Rui Shu;Hung Bui;Hirokazu Narui;Stefano Ermon", "authorids": "ruishu@stanford.edu;buih@google.com;hirokaz2@stanford.edu;ermon@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nshu2018a,\ntitle={A {DIRT}-T Approach to Unsupervised Domain Adaptation},\nauthor={Rui Shu and Hung Bui and Hirokazu Narui and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1q-TM-AW},\n}", "github": "[![github](/images/github_icon.svg) RuiShu/dirt-t](https://github.com/RuiShu/dirt-t) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=H1q-TM-AW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "2;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 758, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8960716763873957731&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1q-TM-AW", "pdf": "https://openreview.net/pdf?id=H1q-TM-AW", "email": ";;;", "author_num": 4 }, { "id": "H1rRWl-Cb", "title": "An information-theoretic analysis of deep latent-variable models", "track": "main", "status": "Reject", "tldr": "We provide an information theoretic and experimental analysis of state-of-the-art variational autoencoders.", "abstract": "We present an information-theoretic framework for understanding trade-offs in unsupervised learning of deep latent-variables models using variational inference. This framework emphasizes the need to consider latent-variable models along two dimensions: the ability to reconstruct inputs (distortion) and the communication cost (rate). We derive the optimal frontier of generative models in the two-dimensional rate-distortion plane, and show how the standard evidence lower bound objective is insufficient to select between points along this frontier. However, by performing targeted optimization to learn generative models with different rates, we are able to learn many models that can achieve similar generative performance but make vastly different trade-offs in terms of the usage of the latent variable. Through experiments on MNIST and Omniglot with a variety of architectures, we show how our framework sheds light on many recent proposed extensions to the variational autoencoder family.", "keywords": "information theory;generative models;latent variable models;variational autoencoders", "primary_area": "", "supplementary_material": "", "author": "Alex Alemi;Ben Poole;Ian Fischer;Josh Dillon;Rif A. 
Saurus;Kevin Murphy", "authorids": "alemi@google.com;poole@cs.stanford.edu;iansf@google.com;jvdillon@google.com;rif@google.com;kpmurphy@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nalemi2018an,\ntitle={An information-theoretic analysis of deep latent-variable models},\nauthor={Alex Alemi and Ben Poole and Ian Fischer and Josh Dillon and Rif A. Saurus and Kevin Murphy},\nyear={2018},\nurl={https://openreview.net/forum?id=H1rRWl-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1rRWl-Cb", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6014015792815429555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Learning From Noisy Singly-labeled Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/158", "id": "H1sUHgb0Z", "author_site": "Ashish Khetan, Zachary Lipton, anima anandkumar", "tldr": "A new approach for learning a model from noisy crowdsourced annotations.", "abstract": "Supervised learning depends on annotated examples, which are taken to be the ground truth. But these labels often come from noisy crowdsourcing platforms, like Amazon Mechanical Turk. Practitioners typically collect multiple labels per example and aggregate the results to mitigate noise (the classic crowdsourcing problem). Given a fixed annotation budget and unlimited unlabeled data, redundant annotation comes at the expense of fewer labeled examples. This raises two fundamental questions: (1) How can we best learn from noisy workers? (2) How should we allocate our labeling budget to maximize the performance of a classifier? We propose a new algorithm for jointly modeling labels and worker quality from noisy crowd-sourced data. The alternating minimization proceeds in rounds, estimating worker quality from disagreement with the current model and then updating the model by optimizing a loss function that accounts for the current estimate of worker quality. Unlike previous approaches, even with only one annotation per example, our algorithm can estimate worker quality. We establish a generalization error bound for models learned with our algorithm and establish theoretically that it's better to label many examples once (vs less multiply) when worker quality exceeds a threshold. Experiments conducted on both ImageNet (with simulated noisy workers) and MS-COCO (using the real crowdsourced labels) confirm our algorithm's benefits. ", "keywords": "crowdsourcing;noisy annotations;deep leaerning", "primary_area": "", "supplementary_material": "", "author": "Ashish Khetan;Zachary C. Lipton;Animashree Anandkumar", "authorids": "khetan2@illinois.edu;zlipton@cmu.edu;anima@amazon.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkhetan2018learning,\ntitle={Learning From Noisy Singly-labeled Data},\nauthor={Ashish Khetan and Zachary C. 
Lipton and Anima Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1sUHgb0Z},\n}", "github": "[![github](/images/github_icon.svg) khetan2/MBEM](https://github.com/khetan2/MBEM)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1761205373572122420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=H1sUHgb0Z", "pdf": "https://openreview.net/pdf?id=H1sUHgb0Z", "email": ";;", "author_num": 3 }, { "id": "H1srNebAZ", "title": "Discovering the mechanics of hidden neurons", "track": "main", "status": "Reject", "tldr": "We report experiments providing strong evidence that a neuron behaves like a binary classifier during training and testing", "abstract": "Neural networks trained through stochastic gradient descent (SGD) have been around for more than 30 years, but they still escape our understanding. This paper takes an experimental approach, with a divide-and-conquer strategy in mind: we start by studying what happens in single neurons. While being the core building block of deep neural networks, the way they encode information about the inputs and how such encodings emerge is still unknown. We report experiments providing strong evidence that hidden neurons behave like binary classifiers during training and testing. During training, analysis of the gradients reveals that a neuron separates two categories of inputs, which are impressively constant across training. During testing, we show that the fuzzy, binary partition described above embeds the core information used by the network for its prediction. 
These observations bring to light some of the core internal mechanics of deep neural networks, and have the potential to guide the next theoretical and practical developments.", "keywords": "deep learning;experimental analysis;hidden neurons", "primary_area": "", "supplementary_material": "", "author": "Simon Carbonnelle;Christophe De Vleeschouwer", "authorids": "simon.carbonnelle@uclouvain.be;christophe.devleeschouwer@uclouvain.be", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncarbonnelle2018discovering,\ntitle={Discovering the mechanics of hidden neurons},\nauthor={Simon Carbonnelle and Christophe De Vleeschouwer},\nyear={2018},\nurl={https://openreview.net/forum?id=H1srNebAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1srNebAZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12717164529276233003&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Variance Reduction for Policy Gradient with Action-Dependent Factorized Baselines", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/115", "id": "H1tSsb-AW", "author_site": "Cathy Wu, Aravind Rajeswaran, Yan Duan, Vikash Kumar, Alexandre M Bayen, Sham M Kakade, Igor Mordatch, Pieter Abbeel", "tldr": "Action-dependent baselines can be bias-free and yield greater variance reduction than state-only dependent baselines for policy gradient methods.", "abstract": "Policy gradient methods have enjoyed great success in deep reinforcement learning but suffer from high variance of gradient estimates. The high variance problem is particularly exasperated in problems with long horizons or high-dimensional action spaces. To mitigate this issue, we derive a bias-free action-dependent baseline for variance reduction which fully exploits the structural form of the stochastic policy itself and does not make any additional assumptions about the MDP. We demonstrate and quantify the benefit of the action-dependent baseline through both theoretical analysis as well as numerical results, including an analysis of the suboptimality of the optimal state-dependent baseline. The result is a computationally efficient policy gradient algorithm, which scales to high-dimensional control problems, as demonstrated by a synthetic 2000-dimensional target matching task. Our experimental results indicate that action-dependent baselines allow for faster learning on standard reinforcement learning benchmarks and high-dimensional hand manipulation and synthetic tasks. 
Finally, we show that the general idea of including additional information in baselines for improved variance reduction can be extended to partially observed and multi-agent tasks.", "keywords": "reinforcement learning;policy gradient;variance reduction;baseline;control variates", "primary_area": "", "supplementary_material": "", "author": "Cathy Wu;Aravind Rajeswaran;Yan Duan;Vikash Kumar;Alexandre M Bayen;Sham Kakade;Igor Mordatch;Pieter Abbeel", "authorids": "cathywu@eecs.berkeley.edu;aravraj@cs.washington.edu;rockyduan@eecs.berkeley.edu;vikash@cs.washington.edu;bayen@berkeley.edu;sham@cs.washington.edu;igor.mordatch@gmail.com;pabbeel@cs.berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nwu2018variance,\ntitle={Variance Reduction for Policy Gradient with Action-Dependent Factorized Baselines},\nauthor={Cathy Wu and Aravind Rajeswaran and Yan Duan and Vikash Kumar and Alexandre M Bayen and Sham Kakade and Igor Mordatch and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1tSsb-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;3", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 8, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11001316948042198974&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=H1tSsb-AW", "pdf": "https://openreview.net/pdf?id=H1tSsb-AW", "email": ";;;;;;;", "author_num": 8 }, { "id": "H1u8fMW0b", "title": "Toward predictive machine learning for active vision", "track": "main", "status": "Reject", "tldr": "Pros and cons of saccade-based computer vision under a predictive coding perspective", "abstract": "We develop a comprehensive description of the active inference framework, as proposed by Friston (2010), under a machine-learning compliant perspective. Stemming from a biological inspiration and the auto-encoding principles, a sketch of a cognitive architecture is proposed that should provide ways to implement estimation-oriented control policies. Computer simulations illustrate the effectiveness of the approach through a foveated inspection of the input data. The pros and cons of the control policy are analyzed in detail, showing interesting promises in terms of processing compression. Though optimizing future posterior entropy over the actions set is shown enough to attain locally optimal action selection, offline calculation using class-specific saliency maps is shown better for it saves processing costs through saccades pathways pre-processing, with a negligible effect on the recognition/compression rates. 
", "keywords": "active inference;predictive coding;motor control", "primary_area": "", "supplementary_material": "", "author": "Emmanuel Dauc\u00e9", "authorids": "emmanuel.dauce@centrale-marseille.fr", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndauc\u00e92018toward,\ntitle={Toward predictive machine learning for active vision},\nauthor={Emmanuel Dauc\u00e9},\nyear={2018},\nurl={https://openreview.net/forum?id=H1u8fMW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1u8fMW0b", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;2", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": -0.9449111825230683, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6834249937501123216&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1uP7ebAW", "title": "Learning to diagnose from scratch by exploiting dependencies among labels", "track": "main", "status": "Reject", "tldr": "we present the state-of-the-art results of using neural networks to diagnose chest x-rays", "abstract": "The field of medical diagnostics contains a wealth of challenges which closely resemble classical machine learning problems; practical constraints, however, complicate the translation of these endpoints naively into classical architectures. Many tasks in radiology, for example, are largely problems of multi-label classification wherein medical images are interpreted to indicate multiple present or suspected pathologies. Clinical settings drive the necessity for high accuracy simultaneously across a multitude of pathological outcomes and greatly limit the utility of tools which consider only a subset. This issue is exacerbated by a general scarcity of training data and maximizes the need to extract clinically relevant features from available samples -- ideally without the use of pre-trained models which may carry forward undesirable biases from tangentially related tasks. We present and evaluate a partial solution to these constraints in using LSTMs to leverage interdependencies among target labels in predicting 14 pathologic patterns from chest x-rays and establish state of the art results on the largest publicly available chest x-ray dataset from the NIH without pre-training. 
Furthermore, we propose and discuss alternative evaluation metrics and their relevance in clinical practice.", "keywords": "medical diagnosis;medical imaging;multi-label classification", "primary_area": "", "supplementary_material": "", "author": "Li Yao;Eric Poblenz;Dmitry Dagunts;Ben Covington;Devon Bernard;Kevin Lyman", "authorids": "li@enlitic.com;eric@enlitic.com;dmitry@enlitic.com;ben@enlitic.com;devon@entlic.com;kevin@enlitic.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyao2018learning,\ntitle={Learning to diagnose from scratch by exploiting dependencies among labels},\nauthor={Li Yao and Eric Poblenz and Dmitry Dagunts and Ben Covington and Devon Bernard and Kevin Lyman},\nyear={2018},\nurl={https://openreview.net/forum?id=H1uP7ebAW},\n}", "github": "[![github](/images/github_icon.svg) yaoli/chest_xray_14](https://github.com/yaoli/chest_xray_14) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=H1uP7ebAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1uP7ebAW", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 456, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13314804248281810203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Stochastic Activation Pruning for Robust Adversarial Defense", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/71", "id": "H1uR4GZRZ", "author_site": "Guneet Dhillon, Kamyar Azizzadenesheli, Zachary Lipton, Jeremy Bernstein, Jean Kossaifi, Aran Khanna, anima anandkumar", "tldr": "", "abstract": "Neural networks are known to be vulnerable to adversarial examples. Carefully chosen perturbations to real images, while imperceptible to humans, induce misclassification and threaten the reliability of deep learning systems in the wild. To guard against adversarial examples, we take inspiration from game theory and cast the problem as a minimax zero-sum game between the adversary and the model. In general, for such games, the optimal strategy for both players requires a stochastic policy, also known as a mixed strategy. In this light, we propose Stochastic Activation Pruning (SAP), a mixed strategy for adversarial defense. SAP prunes a random subset of activations (preferentially pruning those with smaller magnitude) and scales up the survivors to compensate. We can apply SAP to pretrained networks, including adversarially trained models, without fine-tuning, providing robustness against adversarial examples. Experiments demonstrate that SAP confers robustness against attacks, increasing accuracy and preserving calibration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guneet S. Dhillon;Kamyar Azizzadenesheli;Zachary C. Lipton;Jeremy D. 
Bernstein;Jean Kossaifi;Aran Khanna;Animashree Anandkumar", "authorids": "guneetdhillon@utexas.edu;kazizzad@uci.edu;zlipton@cmu.edu;bernstein@caltech.edu;jean.kossaifi@gmail.com;arankhan@amazon.com;animakumar@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\ns.2018stochastic,\ntitle={Stochastic activation pruning for robust adversarial defense},\nauthor={Guneet S. Dhillon and Kamyar Azizzadenesheli and Jeremy D. Bernstein and Jean Kossaifi and Aran Khanna and Zachary C. Lipton and Animashree Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1uR4GZRZ},\n}", "github": "[![github](/images/github_icon.svg) Guneet-Dhillon/Stochastic-Activation-Pruning](https://github.com/Guneet-Dhillon/Stochastic-Activation-Pruning)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 729, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15156063651812856477&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=H1uR4GZRZ", "pdf": "https://openreview.net/pdf?id=H1uR4GZRZ", "email": ";;;;;;", "author_num": 7 }, { "id": "H1vCXOe0b", "title": "Interpreting Deep Classification Models With Bayesian Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a novel approach to interpret a well-trained classification model through systematically investigating effects of its hidden units on prediction making. We search for the core hidden units responsible for predicting inputs as the class of interest under the generative Bayesian inference framework. We model such a process of unit selection as an Indian Buffet Process, and derive a simplified objective function via the MAP asymptotic technique. The induced binary optimization problem is efficiently solved with a continuous relaxation method by attaching a Switch Gate layer to the hidden layers of interest. The resulting interpreter model is thus end-to-end optimized via standard gradient back-propagation. Experiments are conducted with two popular deep convolutional classifiers, respectively well-trained on the MNIST dataset and the CIFAR10 dataset. The results demonstrate that the proposed interpreter successfully finds the core hidden units most responsible for prediction making. The modified model, only with the selected units activated, can hold correct predictions at a high rate. 
Besides, this interpreter model is also able to extract the most informative pixels in the images by connecting a Switch Gate layer to the input layer.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hanshu Yan;Jiashi Feng", "authorids": "eleyanh@nus.edu.sg;elefjia@nus.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nyan2018interpreting,\ntitle={Interpreting Deep Classification Models With Bayesian Inference},\nauthor={Hanshu Yan and Jiashi Feng},\nyear={2018},\nurl={https://openreview.net/forum?id=H1vCXOe0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1vCXOe0b", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8aMSLFq43gwJ:scholar.google.com/&scioq=Interpreting+Deep+Classification+Models+With+Bayesian+Inference&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Emergent Translation in Multi-Agent Communication", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/219", "id": "H1vEXaxA-", "author_site": "Jason Lee, Kyunghyun Cho, Jason Weston, Douwe Kiela", "tldr": "", "abstract": "While most machine translation systems to date are trained on large parallel corpora, humans learn language in a different way: by being grounded in an environment and interacting with other humans. In this work, we propose a communication game where two agents, native speakers of their own respective languages, jointly learn to solve a visual referential task. We find that the ability to understand and translate a foreign language emerges as a means to achieve shared goals. The emergent translation is interactive and multimodal, and crucially does not require parallel corpora, but only monolingual, independent text and corresponding images. Our proposed translation model achieves this by grounding the source and target languages into a shared visual modality, and outperforms several baselines on both word-level and sentence-level translation tasks. 
Furthermore, we show that agents in a multilingual community learn to translate better and faster than in a bilingual communication setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jason Lee;Kyunghyun Cho;Jason Weston;Douwe Kiela", "authorids": "jason@cs.nyu.edu;kyunghyun.cho@nyu.edu;jase@fb.com;dkiela@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlee2018emergent,\ntitle={Emergent Translation in Multi-Agent Communication},\nauthor={Jason Lee and Kyunghyun Cho and Jason Weston and Douwe Kiela},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1vEXaxA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;3;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16875774594076963034&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1vEXaxA-", "pdf": "https://openreview.net/pdf?id=H1vEXaxA-", "email": ";;;", "author_num": 4 }, { "id": "H1wt9x-RW", "title": "Interpretable and Pedagogical Examples", "track": "main", "status": "Reject", "tldr": "We show that training a student and teacher network iteratively, rather than jointly, can produce emergent, interpretable teaching strategies.", "abstract": "Teachers intentionally pick the most informative examples to show their students. However, if the teacher and student are neural networks, the examples that the teacher network learns to give, although effective at teaching the student, are typically uninterpretable. We show that training the student and teacher iteratively, rather than jointly, can produce interpretable teaching strategies. We evaluate interpretability by (1) measuring the similarity of the teacher's emergent strategies to intuitive strategies in each domain and (2) conducting human experiments to evaluate how effective the teacher's strategies are at teaching humans. 
We show that the teacher network learns to select or generate interpretable, pedagogical examples to teach rule-based, probabilistic, boolean, and hierarchical concepts.", "keywords": "machine teaching;interpretability;communication;cognitive science", "primary_area": "", "supplementary_material": "", "author": "Smitha Milli;Pieter Abbeel;Igor Mordatch", "authorids": "smilli@berkeley.edu;pabbeel@cs.berkeley.edu;igor.mordatch@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmilli2018interpretable,\ntitle={Interpretable and Pedagogical Examples},\nauthor={Smitha Milli and Pieter Abbeel and Igor Mordatch},\nyear={2018},\nurl={https://openreview.net/forum?id=H1wt9x-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1wt9x-RW", "pdf_size": 0, "rating": "4;8;8", "confidence": "3;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.5, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6840945339965469051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "H1xJjlbAZ", "title": "INTERPRETATION OF NEURAL NETWORK IS FRAGILE", "track": "main", "status": "Reject", "tldr": "Can we trust a neural network's explanation for its prediction? We examine the robustness of several popular notions of interpretability of neural networks including saliency maps and influence functions and design adversarial examples against them.", "abstract": "In order for machine learning to be deployed and trusted in many applications, it is crucial to be able to reliably explain why the machine learning algorithm makes certain predictions. For example, if an algorithm classifies a given pathology image to be a malignant tumor, then the doctor may need to know which parts of the image led the algorithm to this classification. How to interpret black-box predictors is thus an important and active area of research. A fundamental question is: how much can we trust the interpretation itself? In this paper, we show that interpretation of deep learning predictions is extremely fragile in the following sense: two perceptively indistinguishable inputs with the same predicted label can be assigned very different interpretations. We systematically characterize the fragility of the interpretations generated by several widely-used feature-importance interpretation methods (saliency maps, integrated gradient, and DeepLIFT) on ImageNet and CIFAR-10. Our experiments show that even small random perturbation can change the feature importance and new systematic perturbations can lead to dramatically different interpretations without changing the label. We extend these results to show that interpretations based on exemplars (e.g. influence functions) are similarly fragile. 
Our analysis of the geometry of the Hessian matrix gives insight on why fragility could be a fundamental challenge to the current interpretation approaches.", "keywords": "Adversarial Attack;Interpretability;Saliency Map;Influence Function;Robustness;Machine Learning;Deep Learning;Neural Network", "primary_area": "", "supplementary_material": "", "author": "Amirata Ghorbani;Abubakar Abid;James Zou", "authorids": "amiratag@stanford.edu;a12d@stanford.edu;jamesz@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nghorbani2018interpretation,\ntitle={{INTERPRETATION} {OF} {NEURAL} {NETWORK} {IS} {FRAGILE}},\nauthor={Amirata Ghorbani and Abubakar Abid and James Zou},\nyear={2018},\nurl={https://openreview.net/forum?id=H1xJjlbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1xJjlbAZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;2", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 21, "authors#_avg": 3, "corr_rating_confidence": -0.6546536707079772, "gs_citation": 1064, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8913730552362106675&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "H1zRea1Mf", "title": "pix2code: Generating Code from a Graphical User Interface Screenshot", "track": "main", "status": "Withdraw", "tldr": "CNN and LSTM to generate markup-like code describing graphical user interface images.", "abstract": "Transforming a graphical user interface screenshot created by a designer into computer code is a typical task conducted by a developer in order to build customized software, websites, and mobile applications. In this paper, we show that deep learning methods can be leveraged to train a model end-to-end to automatically generate code from a single input image with over 77% of accuracy for three different platforms (i.e. iOS, Android and web-based technologies).", "keywords": "computer vision;scene understanding;text processing", "primary_area": "", "supplementary_material": "", "author": "Tony Beltramelli", "authorids": "tony@uizard.io", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1zRea1Mf", "pdf_size": 0, "rating": "2;5;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": -1.0, "gs_citation": 385, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8296741513177971931&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Hyperparameter optimization: a spectral approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/280", "id": "H1zriGeCZ", "author_site": "Elad Hazan, Adam Klivans, Yang Yuan", "tldr": "A hyperparameter tuning algorithm using discrete Fourier analysis and compressed sensing", "abstract": "We give a simple, fast algorithm for hyperparameter optimization inspired by techniques from the analysis of Boolean functions. We focus on the high-dimensional regime where the canonical example is training a neural network with a large number of hyperparameters. 
The algorithm --- an iterative application of compressed sensing techniques for orthogonal polynomials --- requires only uniform sampling of the hyperparameters and is thus easily parallelizable.\n \nExperiments for training deep neural networks on Cifar-10 show that compared to state-of-the-art tools (e.g., Hyperband and Spearmint), our algorithm finds significantly improved solutions, in some cases better than what is attainable by hand-tuning. In terms of overall running time (i.e., time required to sample various settings of hyperparameters plus additional computation time), we are at least an order of magnitude faster than Hyperband and Bayesian Optimization. We also outperform Random Search $8\\times$.\n \nOur method is inspired by provably-efficient algorithms for learning decision trees using the discrete Fourier transform. We obtain improved sample-complexty bounds for learning decision trees while matching state-of-the-art bounds on running time (polynomial and quasipolynomial, respectively). ", "keywords": "Hyperparameter Optimization;Fourier Analysis;Decision Tree;Compressed Sensing", "primary_area": "", "supplementary_material": "", "author": "Elad Hazan;Adam Klivans;Yang Yuan", "authorids": "ehazan@cs.princeton.edu;klivans@cs.utexas.edu;yangyuan@cs.cornell.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhazan2018hyperparameter,\ntitle={Hyperparameter optimization: a spectral approach},\nauthor={Elad Hazan and Adam Klivans and Yang Yuan},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=H1zriGeCZ},\n}", "github": "[![github](/images/github_icon.svg) callowbird/Harmonica](https://github.com/callowbird/Harmonica)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;9", "confidence": "3;4;5", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11236398750787903780&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1zriGeCZ", "pdf": "https://openreview.net/pdf?id=H1zriGeCZ", "email": ";;", "author_num": 3 }, { "id": "HJ1HFlZAb", "title": "Evaluation of generative networks through their data augmentation capacity", "track": "main", "status": "Reject", "tldr": "Evaluating generative networks through their data augmentation capacity on discrimative models.", "abstract": "Generative networks are known to be difficult to assess. Recent works on generative models, especially on generative adversarial networks, produce nice samples of varied categories of images. But the validation of their quality is highly dependent on the method used. A good generator should generate data which contain meaningful and varied information and that fit the distribution of a dataset. This paper presents a new method to assess a generator. Our approach is based on training a classifier with a mixture of real and generated samples. We train a generative model over a labeled training set, then we use this generative model to sample new data points that we mix with the original training data. This mixture of real and generated data is thus used to train a classifier which is afterwards tested on a given labeled test dataset. 
We compare this result with the score of the same classifier trained on the real training data mixed with noise. By computing the classifier's accuracy with different ratios of samples from both distributions (real and generated) we are able to estimate if the generator successfully fits and is able to generalize the distribution of the dataset. Our experiments compare the result of different generators from the VAE and GAN framework on MNIST and fashion MNIST dataset.", "keywords": "Generative models;Evaluation of generative models;Data Augmentation", "primary_area": "", "supplementary_material": "", "author": "Timoth\u00e9e Lesort;Florian Bordes;Jean-Francois Goudou;David Filliat", "authorids": "t.lesort@gmail.com;florian.bordes@umontreal.ca;jean-francois.goudou@thalesgroup.com;david.filliat@ensta-paristech.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlesort2018evaluation,\ntitle={Evaluation of generative networks through their data augmentation capacity},\nauthor={Timoth\u00e9e Lesort and Florian Bordes and Jean-Francois Goudou and David Filliat},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ1HFlZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJ1HFlZAb", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;5;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16398394963588270018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJ1ZxXbAM", "title": "DEEPCAST : UNIVERSAL TIME-SERIES FORECASTER", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Reliable and accurate time-series forecasting is critical in many fields including energy, finance, and manufacturing. Many time-series tasks, however, suffer from a limited amount of training data (i.e., the cold start problem) resulting in poor forecasting performance. Recently, convolutional neural networks (CNNs) have shown outstanding image classification performance even on tasks with small-scale training sets. The performance can be attributed to transfer learning through CNNs\u2019 ability to learn rich mid-level image representations. However, no prior work exists on general transfer learning for time-series forecasting. In this paper, motivated by recent success of transfer learning in CNN model and image-related tasks, we for the first time show how time-series representations learned with Long Short Term Memory (LSTM) on large-scale datasets can be efficiently transferred to other time-series forecasting tasks with limited amount of training data. We also validate that despite differences in time-series statistics and tasks in the datasets, the transferred representation leads to significantly improved forecasting results outperforming majority of the best time-series methods on the public M3 and other datasets. 
Our online universal forecasting tool, DeepCast, will leverage transfer learning to provide accurate forecasts for a diverse set of time series where classical methods were computationally infeasible or inapplicable due to short training history.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nikolay Laptev;Jiafan Yu;Ram Rajagopal", "authorids": "nlaptev@stanford.edu;joyjfy@gmail.com;ramr@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HJ1ZxXbAM", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 3, "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16753126775550923457&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HJ39YKiTb", "title": "Associative Conversation Model: Generating Visual Information from Textual Information", "track": "main", "status": "Reject", "tldr": "Proposal of the sentence generation method based on fusion between textual information and visual information associated with the textual information", "abstract": "In this paper, we propose the Associative Conversation Model that generates visual information from textual information and uses it for generating sentences in order to utilize visual information in a dialogue system without image input. In research on Neural Machine Translation, there are studies that generate translated sentences using both images and sentences, and these studies show that visual information improves translation performance. However, it is not possible to use sentence generation algorithms using images for the dialogue systems since many text-based dialogue systems only accept text input. Our approach generates (associates) visual information from input text and generates response text using context vector fusing associative visual information and sentence textual information. A comparative experiment between our proposed model and a model without association showed that our proposed model is generating useful sentences by associating visual information related to sentences. 
Furthermore, analysis experiment of visual association showed that our proposed model generates (associates) visual information effective for sentence generation.", "keywords": "conversation model;multimodal embedding;attention mechanism;natural language processing;encoder-decoder model", "primary_area": "", "supplementary_material": "", "author": "Yoichi Ishibashi;Hisashi Miyamori", "authorids": "g1445539@cc.kyoto-su.ac.jp;miya@cc.kyoto-su.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nishibashi2018associative,\ntitle={Associative Conversation Model: Generating Visual Information from Textual Information},\nauthor={Yoichi Ishibashi and Hisashi Miyamori},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ39YKiTb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJ39YKiTb", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a6BHlbJ7ep0J:scholar.google.com/&scioq=Associative+Conversation+Model:+Generating+Visual+Information+from+Textual+Information&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJ3d2Ax0-", "title": "Benefits of Depth for Long-Term Memory of Recurrent Networks", "track": "main", "status": "Workshop", "tldr": "We propose a measure of long-term memory and prove that deep recurrent networks are much better fit to model long-term temporal dependencies than shallow ones.", "abstract": "The key attribute that drives the unprecedented success of modern Recurrent Neural Networks (RNNs) on learning tasks which involve sequential data, is their ever-improving ability to model intricate long-term temporal dependencies. However, a well established measure of RNNs' long-term memory capacity is lacking, and thus formal understanding of their ability to correlate data throughout time is limited. Though depth efficiency in convolutional networks is well established by now, it does not suffice in order to account for the success of deep RNNs on inputs of varying lengths, and the need to address their 'time-series expressive power' arises. In this paper, we analyze the effect of depth on the ability of recurrent networks to express correlations ranging over long time-scales. To meet the above need, we introduce a measure of the information flow across time that can be supported by the network, referred to as the Start-End separation rank. Essentially, this measure reflects the distance of the function realized by the recurrent network from a function that models no interaction whatsoever between the beginning and end of the input sequence. We prove that deep recurrent networks support Start-End separation ranks which are exponentially higher than those supported by their shallow counterparts. Moreover, we show that the ability of deep recurrent networks to correlate different parts of the input sequence increases exponentially as the input sequence extends, while that of vanilla shallow recurrent networks does not adapt to the sequence length at all. 
Thus, we establish that depth brings forth an overwhelming advantage in the ability of recurrent networks to model long-term dependencies, and provide an exemplar of quantifying this key attribute which may be readily extended to other RNN architectures of interest, e.g. variants of LSTM networks. We obtain our results by considering a class of recurrent networks referred to as Recurrent Arithmetic Circuits (RACs), which merge the hidden state with the input via the Multiplicative Integration operation.", "keywords": "recurrent neural networks;deep networks;correlations;long term memory;tensor networks;tensor analysis", "primary_area": "", "supplementary_material": "", "author": "Yoav Levine;Or Sharir;Amnon Shashua", "authorids": "yoavlevine@cs.huji.ac.il;or.sharir@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlevine2018benefits,\ntitle={Benefits of Depth for Long-Term Memory of Recurrent Networks},\nauthor={Yoav Levine and Or Sharir and Amnon Shashua},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ3d2Ax0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HJ3d2Ax0-", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;3;3", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8355261120160301560&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HJ4IhxZAb", "title": "Meta-Learning Transferable Active Learning Policies by Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Active learning (AL) aims to enable training high performance classifiers with low annotation cost by predicting which subset of unlabelled instances would be most beneficial to label. The importance of AL has motivated extensive research, proposing a wide variety of manually designed AL algorithms with diverse theoretical and intuitive motivations. In contrast to this body of research, we propose to treat active learning algorithm design as a meta-learning problem and learn the best criterion from data. We model an active learning algorithm as a deep neural network that inputs the base learner state and the unlabelled point set and predicts the best point to annotate next. Training this active query policy network with reinforcement learning, produces the best non-myopic policy for a given dataset. The key challenge in achieving a general solution to AL then becomes that of learner generalisation, particularly across heterogeneous datasets. We propose a multi-task dataset-embedding approach that allows dataset-agnostic active learners to be trained. 
Our evaluation shows that AL algorithms trained in this way can directly generalize across diverse problems.", "keywords": "Active Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Kunkun Pang;Mingzhi Dong;Timothy Hospedales", "authorids": "k.pang@ed.ac.uk;mingzhi.dong.13@ucl.ac.uk;t.hospedales@ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npang2018metalearning,\ntitle={Meta-Learning Transferable Active Learning Policies by Deep Reinforcement Learning},\nauthor={Kunkun Pang and Mingzhi Dong and Timothy Hospedales},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ4IhxZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ4IhxZAb", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17667950294364672079&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3 }, { "id": "HJ4YGZ-AW", "title": "A Neural-Symbolic Approach to Natural Language Tasks", "track": "main", "status": "Reject", "tldr": "This paper is intended to develop a tensor product representation approach for deep-learning-based natural language processinig applications.", "abstract": "Deep learning (DL) has in recent years been widely used in natural\nlanguage processing (NLP) applications due to its superior\nperformance. However, while natural languages are rich in\ngrammatical structure, DL has not been able to explicitly\nrepresent and enforce such structures. This paper proposes a new\narchitecture to bridge this gap by exploiting tensor product\nrepresentations (TPR), a structured neural-symbolic framework\ndeveloped in cognitive science over the past 20 years, with the\naim of integrating DL with explicit language structures and rules.\nWe call it the Tensor Product Generation Network\n(TPGN), and apply it to image captioning. The key\nideas of TPGN are: 1) unsupervised learning of\nrole-unbinding vectors of words via a TPR-based deep neural\nnetwork, and 2) integration of TPR with typical DL architectures\nincluding Long Short-Term Memory (LSTM) models. The novelty of our\napproach lies in its ability to generate a sentence and extract\npartial grammatical structure of the sentence by using\nrole-unbinding vectors, which are obtained in an unsupervised\nmanner. 
Experimental results demonstrate the effectiveness of the\nproposed approach.", "keywords": "Deep learning;tensor product representation;LSTM;image captioning", "primary_area": "", "supplementary_material": "", "author": "Qiuyuan Huang;Paul Smolensky;Xiaodong He;Li Deng;Dapeng Wu", "authorids": "idfree@ufl.edu;psmo@microsoft.com;xiaohe@microsoft.com;deng629@gmail.com;dpwu@ufl.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhuang2018a,\ntitle={A Neural-Symbolic Approach to Natural Language Tasks},\nauthor={Qiuyuan Huang and Paul Smolensky and Xiaodong He and Li Deng and Dapeng Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ4YGZ-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ4YGZ-AW", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14530335038798777736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HJ5AUm-CZ", "title": "The Variational Homoencoder: Learning to Infer High-Capacity Generative Models from Few Examples", "track": "main", "status": "Reject", "tldr": "Technique for learning deep generative models with shared latent variables, applied to Omniglot with a PixelCNN decoder.", "abstract": "Hierarchical Bayesian methods have the potential to unify many related tasks (e.g. k-shot classification, conditional, and unconditional generation) by framing each as inference within a single generative model. We show that existing approaches for learning such models can fail on expressive generative networks such as PixelCNNs, by describing the global distribution with little reliance on latent variables. To address this, we develop a modification of the Variational Autoencoder in which encoded observations are decoded to new elements from the same class; the result, which we call a Variational Homoencoder (VHE), may be understood as training a hierarchical latent variable model which better utilises latent variables in these cases. Using this framework enables us to train a hierarchical PixelCNN for the Omniglot dataset, outperforming all existing models on test set likelihood. With a single model we achieve both strong one-shot generation and near human-level classification, competitive with state-of-the-art discriminative classifiers. The VHE objective extends naturally to richer dataset structures such as factorial or hierarchical categories, as we illustrate by training models to separate character content from simple variations in drawing style, and to generalise the style of an alphabet to new characters.", "keywords": "generative models;one-shot learning;metalearning;pixelcnn;hierarchical bayesian;omniglot", "primary_area": "", "supplementary_material": "", "author": "Luke Hewitt;Andrea Gane;Tommi Jaakkola;Joshua B. 
Tenenbaum", "authorids": "lbh@mit.edu;agane@mit.edu;tommi@csail.mit.edu;jbt@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhewitt2018the,\ntitle={The Variational Homoencoder: Learning to Infer High-Capacity Generative Models from Few Examples},\nauthor={Luke Hewitt and Andrea Gane and Tommi Jaakkola and Joshua B. Tenenbaum},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ5AUm-CZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJ5AUm-CZ", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9930388720551928853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJ8W1Q-0Z", "title": "GATED FAST WEIGHTS FOR ASSOCIATIVE RETRIEVAL", "track": "main", "status": "Reject", "tldr": "An improved Fast Weight network which shows better results on a general toy task.", "abstract": "We improve previous end-to-end differentiable neural networks (NNs) with fast\nweight memories. A gate mechanism updates fast weights at every time step of\na sequence through two separate outer-product-based matrices generated by slow\nparts of the net. The system is trained on a complex sequence to sequence variation\nof the Associative Retrieval Problem with roughly 70 times more temporal\nmemory (i.e. time-varying variables) than similar-sized standard recurrent NNs\n(RNNs). In terms of accuracy and number of parameters, our architecture outperforms\na variety of RNNs, including Long Short-Term Memory, Hypernetworks,\nand related fast weight architectures.", "keywords": "fast weights;RNN;associative retrieval;time-varying variables", "primary_area": "", "supplementary_material": "", "author": "Imanol Schlag;J\u00fcrgen Schmidhuber", "authorids": "imanol@idsia.ch;juergen@idsia.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nschlag2018gated,\ntitle={{GATED} {FAST} {WEIGHTS} {FOR} {ASSOCIATIVE} {RETRIEVAL}},\nauthor={Imanol Schlag and J\u00fcrgen Schmidhuber},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ8W1Q-0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ8W1Q-0Z", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tS9QCqxzm8MJ:scholar.google.com/&scioq=GATED+FAST+WEIGHTS+FOR+ASSOCIATIVE+RETRIEVAL&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Rethinking the Smaller-Norm-Less-Informative Assumption in Channel Pruning of Convolution Layers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/315", "id": "HJ94fqApW", "author_site": "Jianbo Ye, Xin Lu, Zhe Lin, James Z Wang", "tldr": "A CNN model pruning method using ISTA and rescaling trick to enforce sparsity of scaling parameters in batch normalization.", "abstract": "Model pruning has become a useful 
technique that improves the computational efficiency of deep learning, making it possible to deploy solutions in resource-limited scenarios. A widely-used practice in relevant work assumes that a smaller-norm parameter or feature plays a less informative role at the inference time. In this paper, we propose a channel pruning technique for accelerating the computations of deep convolutional neural networks (CNNs) that does not critically rely on this assumption. Instead, it focuses on direct simplification of the channel-to-channel computation graph of a CNN without the need of performing a computationally difficult and not-always-useful task of making high-dimensional tensors of CNN structured sparse. Our approach takes two stages: first to adopt an end-to-end stochastic training method that eventually forces the outputs of some channels to be constant, and then to prune those constant channels from the original neural network by adjusting the biases of their impacting layers such that the resulting compact model can be quickly fine-tuned. Our approach is mathematically appealing from an optimization perspective and easy to reproduce. We experimented our approach through several image learning benchmarks and demonstrate its interesting aspects and competitive performance.", "keywords": "model pruning;batch normalization;convolutional neural network;ISTA", "primary_area": "", "supplementary_material": "", "author": "Jianbo Ye;Xin Lu;Zhe Lin;James Z. Wang", "authorids": "jxy198@ist.psu.edu;xinl@adobe.com;zlin@adobe.com;jwang@ist.psu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nye2018rethinking,\ntitle={Rethinking the Smaller-Norm-Less-Informative Assumption in Channel Pruning of Convolution Layers},\nauthor={Jianbo Ye and Xin Lu and Zhe Lin and James Z. Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ94fqApW},\n}", "github": "[![github](/images/github_icon.svg) bobye/batchnorm_prune](https://github.com/bobye/batchnorm_prune) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJ94fqApW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;5;3", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 530, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17821725364773859726&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HJ94fqApW", "pdf": "https://openreview.net/pdf?id=HJ94fqApW", "email": ";;;", "author_num": 4 }, { "id": "HJBhEMbRb", "title": "A Spectral Approach to Generalization and Optimization in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recent success of deep neural networks stems from their ability to generalize well on real data; however, Zhang et al. have observed that neural networks can easily overfit random labels. This observation demonstrates that with the existing theory, we cannot adequately explain why gradient methods can find generalizable solutions for neural networks. 
In this work, we use a Fourier-based approach to study the generalization properties of gradient-based methods over 2-layer neural networks with sinusoidal activation functions. We prove that if the underlying distribution of data has nice spectral properties such as bandlimitedness, then the gradient descent method will converge to generalizable local minima. We also establish a Fourier-based generalization bound for bandlimited spaces, which generalizes to other activation functions. Our generalization bound motivates a grouped version of path norms for measuring the complexity of 2-layer neural networks with ReLU activation functions. We demonstrate numerically that regularization of this group path norm results in neural network solutions that can fit true labels without losing test accuracy while not overfitting random labels.", "keywords": "Generalization;Neural Networks;Fourier Analysis", "primary_area": "", "supplementary_material": "", "author": "Farzan Farnia;Jesse Zhang;David Tse", "authorids": "farnia@stanford.edu;jessez@stanford.edu;dntse@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfarnia2018a,\ntitle={A Spectral Approach to Generalization and Optimization in Neural Networks},\nauthor={Farzan Farnia and Jesse Zhang and David Tse},\nyear={2018},\nurl={https://openreview.net/forum?id=HJBhEMbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJBhEMbRb", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16360849574616332292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Sensitivity and Generalization in Neural Networks: an Empirical Study", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/65", "id": "HJC2SzZCW", "author_site": "Roman Novak, Yasaman Bahri, Daniel Abolafia, Jeffrey Pennington, Jascha Sohl-Dickstein", "tldr": "We perform massive experimental studies characterizing the relationships between Jacobian norms, linear regions, and generalization.", "abstract": "In practice it is often found that large over-parameterized neural networks generalize better than their smaller counterparts, an observation that appears to conflict with classical notions of function complexity, which typically favor smaller models. In this work, we investigate this tension between complexity and generalization through an extensive empirical exploration of two natural metrics of complexity related to sensitivity to input perturbations. Our experiments survey thousands of models with different architectures, optimizers, and other hyper-parameters, as well as four different image classification datasets.\n\nWe find that trained neural networks are more robust to input perturbations in the vicinity of the training data manifold, as measured by the input-output Jacobian of the network, and that this correlates well with generalization. 
We further establish that factors associated with poor generalization -- such as full-batch training or using random labels -- correspond to higher sensitivity, while factors associated with good generalization -- such as data augmentation and ReLU non-linearities -- give rise to more robust functions. Finally, we demonstrate how the input-output Jacobian norm can be predictive of generalization at the level of individual test points.", "keywords": "generalization;complexity;experimental study;linear regions;Jacobian", "primary_area": "", "supplementary_material": "", "author": "Roman Novak;Yasaman Bahri;Daniel A. Abolafia;Jeffrey Pennington;Jascha Sohl-Dickstein", "authorids": "romann@google.com;yasamanb@google.com;danabo@google.com;jpennin@google.com;jaschasd@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nnovak2018sensitivity,\ntitle={Sensitivity and Generalization in Neural Networks: an Empirical Study},\nauthor={Roman Novak and Yasaman Bahri and Daniel A. Abolafia and Jeffrey Pennington and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJC2SzZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": -0.2401922307076307, "gs_citation": 545, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3792676156388255849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=HJC2SzZCW", "pdf": "https://openreview.net/pdf?id=HJC2SzZCW", "email": ";;;;", "author_num": 5 }, { "title": "Hierarchical Density Order Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/7", "id": "HJCXZQbAZ", "author_site": "Ben Athiwaratkun, Andrew Wilson", "tldr": "", "abstract": "By representing words with probability densities rather than point vectors, proba- bilistic word embeddings can capture rich and interpretable semantic information and uncertainty (Vilnis & McCallum, 2014; Athiwaratkun & Wilson, 2017). The uncertainty information can be particularly meaningful in capturing entailment relationships \u2013 whereby general words such as \u201centity\u201d correspond to broad distributions that encompass more specific words such as \u201canimal\u201d or \u201cinstrument\u201d. We introduce density order embeddings, which learn hierarchical representations through encapsulation of probability distributions. In particular, we propose simple yet effective loss functions and distance metrics, as well as graph-based schemes to select negative samples to better learn hierarchical probabilistic representations. 
Our approach provides state-of-the-art performance on the WordNet hypernym relationship prediction task and the challenging HyperLex lexical entailment dataset \u2013 while retaining a rich and interpretable probabilistic representation.", "keywords": "embeddings;word embeddings;probabilistic embeddings;hierarchical representation;probabilistic representation;order embeddings;wordnet;hyperlex", "primary_area": "", "supplementary_material": "", "author": "Ben Athiwaratkun;Andrew Gordon Wilson", "authorids": "pa338@cornell.edu;andrew@cornell.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nathiwaratkun2018on,\ntitle={On Modeling Hierarchical Data via Probabilistic Order Embeddings},\nauthor={Ben Athiwaratkun and Andrew Gordon Wilson},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJCXZQbAZ},\n}", "github": "[![github](/images/github_icon.svg) benathi/density-order-emb](https://github.com/benathi/density-order-emb) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HJCXZQbAZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;8", "confidence": "3;4;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12427920250451702495&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJCXZQbAZ", "pdf": "https://openreview.net/pdf?id=HJCXZQbAZ", "email": ";", "author_num": 2 }, { "id": "HJDUjKeA-", "title": "Learning objects from pixels", "track": "main", "status": "Reject", "tldr": "We show how discrete objects can be learnt in an unsupervised fashion from pixels, and how to perform reinforcement learning using this object representation.", "abstract": "We show how discrete objects can be learnt in an unsupervised fashion from pixels, and how to perform reinforcement learning using this object representation.\n\nMore precisely, we construct a differentiable mapping from an image to a discrete tabular list of objects, where each object consists of a differentiable position, feature vector, and scalar presence value that allows the representation to be learnt using an attention mechanism.\n\nApplying this mapping to Atari games, together with an interaction net-style architecture for calculating quantities from objects, we construct agents that can play Atari games using objects learnt in an unsupervised fashion. 
During training, many natural objects emerge, such as the ball and paddles in Pong, and the submarine and fish in Seaquest.\n\nThis gives the first reinforcement learning agent for Atari with an interpretable object representation, and opens the avenue for agents that can conduct object-based exploration and generalization.", "keywords": "objects;unsupervised;reinforcement learning;atari", "primary_area": "", "supplementary_material": "", "author": "David Saxton", "authorids": "saxton@google.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nsaxton2018learning,\ntitle={Learning objects from pixels},\nauthor={David Saxton},\nyear={2018},\nurl={https://openreview.net/forum?id=HJDUjKeA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJDUjKeA-", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xBEwPdAuTdYJ:scholar.google.com/&scioq=Learning+objects+from+pixels&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJDV5YxCW", "title": "Heterogeneous Bitwidth Binarization in Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce fractional bitwidth approximation and show it has significant advantages.", "abstract": "Recent work has shown that performing inference with fast, very-low-bitwidth\n(e.g., 1 to 2 bits) representations of values in models can yield surprisingly accurate\nresults. However, although 2-bit approximated networks have been shown to\nbe quite accurate, 1 bit approximations, which are twice as fast, have restrictively\nlow accuracy. We propose a method to train models whose weights are a mixture\nof bitwidths, that allows us to more finely tune the accuracy/speed trade-off. We\npresent the \u201cmiddle-out\u201d criterion for determining the bitwidth for each value, and\nshow how to integrate it into training models with a desired mixture of bitwidths.\nWe evaluate several architectures and binarization techniques on the ImageNet\ndataset. We show that our heterogeneous bitwidth approximation achieves superlinear\nscaling of accuracy with bitwidth. 
Using an average of only 1.4 bits, we are\nable to outperform state-of-the-art 2-bit architectures.", "keywords": "Deep Learning;Computer Vision;Approximation", "primary_area": "", "supplementary_material": "", "author": "Josh Fromm;Matthai Philipose;Shwetak Patel", "authorids": "jwfromm@uw.edu;matthaip@microsoft.com;shwetak@cs.washington.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfromm2018heterogeneous,\ntitle={Heterogeneous Bitwidth Binarization in Convolutional Neural Networks},\nauthor={Josh Fromm and Matthai Philipose and Shwetak Patel},\nyear={2018},\nurl={https://openreview.net/forum?id=HJDV5YxCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJDV5YxCW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1091815518844737300&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HJFcmshbM", "title": "DETECTING ADVERSARIAL PERTURBATIONS WITH SALIENCY", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper we propose a novel method for detecting adversarial examples by training a binary classifier with both origin data and saliency data. In the case of an image classification model, saliency simply explains how the model makes decisions by identifying significant pixels for prediction. Perturbing the origin image is essentially perturbing the saliency of the right output w.r.t. the origin image. Our approach shows good performance on detecting adversarial perturbations. We quantitatively evaluate the generalization ability of the detector, where a detector trained with strong adversaries and their saliency performs well on weak adversaries. 
In addition, we further discuss the relationship between solving the adversarial problem and model interpretation, which helps us understand how convolutional neural networks make wrong decisions.", "keywords": "Adversarial Examples;Detection;Saliency;Model Interpretation", "primary_area": "", "supplementary_material": "", "author": "Chiliang Zhang;Zuochang Ye;Bo Zhang;Deli Zhao", "authorids": "zhangcl16@mails.tsinghua.edu.cn;zuochang@tsinhua.edu.cn;zhangbo@xiaomi.com;zhaodeli@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJFcmshbM", "pdf_size": 0, "rating": "3;4;4;4", "confidence": "5;4;4;4", "rating_avg": 3.75, "confidence_avg": 4.25, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16284164334319915567&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "title": "Training and Inference with Integers in Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/330", "id": "HJGXzmspb", "author_site": "Shuang Wu, Guoqi Li, Feng Chen, Luping Shi", "tldr": "We apply training and inference with only low-bitwidth integers in DNNs", "abstract": "Researches on deep neural networks with discrete parameters and their deployment in embedded systems have been active and promising topics. Although previous works have successfully reduced precision in inference, transferring both training and inference processes to low-bitwidth integers has not been demonstrated simultaneously. In this work, we develop a new method termed as ``\"WAGE\" to discretize both training and inference, where weights (W), activations (A), gradients (G) and errors (E) among layers are shifted and linearly constrained to low-bitwidth integers. To perform pure discrete dataflow for fixed-point devices, we further replace batch normalization by a constant scaling layer and simplify other components that are arduous for integer implementation. Improved accuracies can be obtained on multiple datasets, which indicates that WAGE somehow acts as a type of regularization. 
Empirically, we demonstrate the potential to deploy training in hardware systems such as integer-based deep learning accelerators and neuromorphic chips with comparable accuracy and higher energy efficiency, which is crucial to future AI applications in variable scenarios with transfer and continual learning demands.", "keywords": "quantization;training;bitwidth;ternary weights", "primary_area": "", "supplementary_material": "", "author": "Shuang Wu;Guoqi Li;Feng Chen;Luping Shi", "authorids": "wus15@mails.tsinghua.edu.cn;liguoqi@mail.tsinghua.edu.cn;chenfeng@mail.tsinghua.edu.cn;lpshi@mail.tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwu2018training,\ntitle={Training and Inference with Integers in Deep Neural Networks},\nauthor={Shuang Wu and Guoqi Li and Feng Chen and Luping Shi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJGXzmspb},\n}", "github": "[![github](/images/github_icon.svg) boluoweifenda/WAGE](https://github.com/boluoweifenda/WAGE) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJGXzmspb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 527, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15215054387477750278&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJGXzmspb", "pdf": "https://openreview.net/pdf?id=HJGXzmspb", "email": ";;;", "author_num": 4 }, { "id": "HJGcNz-0W", "title": "Convolutional Mesh Autoencoders for 3D Face Representation", "track": "main", "status": "Reject", "tldr": "Convolutional autoencoders generalized to mesh surfaces for encoding and reconstructing extreme 3D facial expressions.", "abstract": "Convolutional neural networks (CNNs) have achieved state of the art performance on recognizing and representing audio, images, videos and 3D volumes; that is, domains where the input can be characterized by a regular graph structure. \nHowever, generalizing CNNs to irregular domains like 3D meshes is challenging. Additionally, training data for 3D meshes is often limited. In this work, we generalize convolutional autoencoders to mesh surfaces. We perform spectral decomposition of meshes and apply convolutions directly in frequency space. In addition, we use max pooling and introduce upsampling within the network to represent meshes in a low dimensional space. We construct a complex dataset of 20,466 high resolution meshes with extreme facial expressions and encode it using our Convolutional Mesh Autoencoder. Despite limited training data, our method outperforms state-of-the-art PCA models of faces with 50% lower error, while using 75% fewer parameters.", "keywords": "meshes;convolutions;faces;autoencoder", "primary_area": "", "supplementary_material": "", "author": "Anurag Ranjan;Timo Bolkart;Michael J. 
Black", "authorids": "anurag.ranjan@tue.mpg.de;timo.bolkart@tuebingen.mpg.de;black@tuebingen.mpg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nranjan2018convolutional,\ntitle={Convolutional Mesh Autoencoders for 3D Face Representation},\nauthor={Anurag Ranjan and Timo Bolkart and Michael J. Black},\nyear={2018},\nurl={https://openreview.net/forum?id=HJGcNz-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJGcNz-0W", "pdf_size": 0, "rating": "2;4;6", "confidence": "5;4;3", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8801450405199018418&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Emergence of Linguistic Communication from Referential Games with Symbolic and Pixel Input", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/138", "id": "HJGv1Z-AW", "author_site": "Angeliki Lazaridou, Karl M Hermann, Karl Tuyls, Stephen Clark", "tldr": "A controlled study of the role of environments with respect to properties in emergent communication protocols.", "abstract": "The ability of algorithms to evolve or learn (compositional) communication protocols has traditionally been studied in the language evolution literature through the use of emergent communication tasks. Here we scale up this research by using contemporary deep learning methods and by training reinforcement-learning neural network agents on referential communication games. We extend previous work, in which agents were trained in symbolic environments, by developing agents which are able to learn from raw pixel data, a more challenging and realistic input representation. We find that the degree of structure found in the input data affects the nature of the emerged protocols, and thereby corroborate the hypothesis that structured compositional language is most likely to emerge when agents perceive the world as being structured. 
", "keywords": "disentanglement;communication;emergent language;compositionality;multi-agent", "primary_area": "", "supplementary_material": "", "author": "Angeliki Lazaridou;Karl Moritz Hermann;Karl Tuyls;Stephen Clark", "authorids": "angeliki@google.com;kmh@google.com;karltuyls@google.com;clarkstephen@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlazaridou2018emergence,\ntitle={Emergence of Linguistic Communication from Referential Games with Symbolic and Pixel Input},\nauthor={Angeliki Lazaridou and Karl Moritz Hermann and Karl Tuyls and Stephen Clark},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJGv1Z-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 276, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12707373577928936905&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJGv1Z-AW", "pdf": "https://openreview.net/pdf?id=HJGv1Z-AW", "email": ";;;", "author_num": 4 }, { "id": "HJIhGXWCZ", "title": "Prediction Under Uncertainty with Error Encoding Networks", "track": "main", "status": "Reject", "tldr": "A simple and easy to train method for multimodal prediction in time series. ", "abstract": "In this work we introduce a new framework for performing temporal predictions\nin the presence of uncertainty. It is based on a simple idea of disentangling com-\nponents of the future state which are predictable from those which are inherently\nunpredictable, and encoding the unpredictable components into a low-dimensional\nlatent variable which is fed into the forward model. Our method uses a simple su-\npervised training objective which is fast and easy to train. 
We evaluate it in the\ncontext of video prediction on multiple datasets and show that it is able to consistently generate diverse predictions without the need for alternating minimization\nover a latent space or adversarial training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikael Henaff;Junbo Zhao;Yann Lecun", "authorids": "mbh305@nyu.edu;j.zhao@nyu.edu;yann@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhenaff2018prediction,\ntitle={Prediction Under Uncertainty with Error Encoding Networks},\nauthor={Mikael Henaff and Junbo Zhao and Yann Lecun},\nyear={2018},\nurl={https://openreview.net/forum?id=HJIhGXWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJIhGXWCZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2525554113311580277&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Adversarial Dropout Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/137", "id": "HJIoJWZCZ", "author_site": "Kuniaki Saito, Yoshitaka Ushiku, Tatsuya Harada, Kate Saenko", "tldr": "We present a new adversarial method for adapting neural representations based on a critic that detects non-discriminative features.", "abstract": "We present a domain adaptation method for transferring neural representations from label-rich source domains to unlabeled target domains. Recent adversarial methods proposed for this task learn to align features across domains by ``fooling'' a special domain classifier network. However, a drawback of this approach is that the domain classifier simply labels the generated features as in-domain or not, without considering the boundaries between classes. This means that ambiguous target features can be generated near class boundaries, reducing target classification accuracy. We propose a novel approach, Adversarial Dropout Regularization (ADR), which encourages the generator to output more discriminative features for the target domain. Our key idea is to replace the traditional domain critic with a critic that detects non-discriminative features by using dropout on the classifier network. The generator then learns to avoid these areas of the feature space and thus creates better features. 
We apply our ADR approach to the problem of unsupervised domain adaptation for image classification and semantic segmentation tasks, and demonstrate significant improvements over the state of the art.", "keywords": "domain adaptation;computer vision;generative models", "primary_area": "", "supplementary_material": "", "author": "Kuniaki Saito;Yoshitaka Ushiku;Tatsuya Harada;Kate Saenko", "authorids": "k-saito@mi.t.u-tokyo.ac.jp;ushiku@mi.t.u-tokyo.ac.jp;harada@mi.t.u-tokyo.ac.jp;saenko@bu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsaito2018adversarial,\ntitle={Adversarial Dropout Regularization},\nauthor={Kuniaki Saito and Yoshitaka Ushiku and Tatsuya Harada and Kate Saenko},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJIoJWZCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;3;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.3273268353539886, "gs_citation": 347, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6341114078934760238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJIoJWZCZ", "pdf": "https://openreview.net/pdf?id=HJIoJWZCZ", "email": ";;;", "author_num": 4 }, { "id": "HJJ0w--0W", "title": "Long-term Forecasting using Tensor-Train RNNs", "track": "main", "status": "Reject", "tldr": "Accurate forecasting over very long time horizons using tensor-train RNNs", "abstract": "We present Tensor-Train RNN (TT-RNN), a novel family of neural sequence architectures for multivariate forecasting in environments with nonlinear dynamics. Long-term forecasting in such systems is highly challenging, since there exist long-term temporal dependencies, higher-order correlations and sensitivity to error propagation. Our proposed tensor recurrent architecture addresses these issues by learning the nonlinear dynamics directly using higher order moments and high-order state transition functions. Furthermore, we decompose the higher-order structure using the tensor-train (TT) decomposition to reduce the number of parameters while preserving the model performance. We theoretically establish the approximation properties of Tensor-Train RNNs for general sequence inputs, and such guarantees are not available for usual RNNs. 
We also demonstrate significant long-term prediction improvements over general RNN and LSTM architectures on a range of simulated environments with nonlinear dynamics, as well on real-world climate and traffic data.", "keywords": "RNNs;time series forecasting;nonlinear dynamics;tensor-train", "primary_area": "", "supplementary_material": "", "author": "Rose Yu;Stephan Zheng;Anima Anandkumar;Yisong Yue", "authorids": "rose@caltech.edu;stephan@caltech.edu;anima@caltech.edu;yyue@caltech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyu2018longterm,\ntitle={Long-term Forecasting using Tensor-Train {RNN}s},\nauthor={Rose Yu and Stephan Zheng and Anima Anandkumar and Yisong Yue},\nyear={2018},\nurl={https://openreview.net/forum?id=HJJ0w--0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJJ0w--0W", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=271773734757196354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Initialization matters: Orthogonal Predictive State Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/109", "id": "HJJ23bW0b", "author_site": "Krzysztof Choromanski, Carlton Downey, Byron Boots", "tldr": "Improving Predictive State Recurrent Neural Networks via Orthogonal Random Features", "abstract": "Learning to predict complex time-series data is a fundamental challenge in a range of disciplines including Machine Learning, Robotics, and Natural Language Processing. Predictive State Recurrent Neural Networks (PSRNNs) (Downey et al.) are a state-of-the-art approach for modeling time-series data which combine the benefits of probabilistic filters and Recurrent Neural Networks into a single model. PSRNNs leverage the concept of Hilbert Space Embeddings of distributions (Smola et al.) to embed predictive states into a Reproducing Kernel Hilbert Space, then estimate, predict, and update these embedded states using Kernel Bayes Rule. Practical implementations of PSRNNs are made possible by the machinery of Random Features, where input features are mapped into a new space where dot products approximate the kernel well. Unfortunately PSRNNs often require a large number of RFs to obtain good results, resulting in large models which are slow to execute and slow to train. Orthogonal Random Features (ORFs) (Choromanski et al.) is an improvement on RFs which has been shown to decrease the number of RFs required for pointwise kernel approximation. Unfortunately, it is not clear that ORFs can be applied to PSRNNs, as PSRNNs rely on Kernel Ridge Regression as a core component of their learning algorithm, and the theoretical guarantees of ORF do not apply in this setting. In this paper, we extend the theory of ORFs to Kernel Ridge Regression and show that ORFs can be used to obtain Orthogonal PSRNNs (OPSRNNs), which are smaller and faster than PSRNNs. 
In particular, we show that OPSRNN models clearly outperform LSTMs and furthermore, can achieve accuracy similar to PSRNNs with an order of magnitude smaller number of features needed.", "keywords": "recurrent neural networks;orthogonal random features;predictive state representations", "primary_area": "", "supplementary_material": "", "author": "Krzysztof Choromanski;Carlton Downey;Byron Boots", "authorids": "kchoro@google.com;cmdowney@cs.cmu.edu;bboots@cc.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchoromanski2018initialization,\ntitle={Initialization matters: Orthogonal Predictive State Recurrent Neural Networks},\nauthor={Krzysztof Choromanski and Carlton Downey and Byron Boots},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJJ23bW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;8", "confidence": "5;2;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.5765566601970552, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6668969337949062074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJJ23bW0b", "pdf": "https://openreview.net/pdf?id=HJJ23bW0b", "email": ";;", "author_num": 3 }, { "id": "HJMN-xWC-", "title": "Learning Parsimonious Deep Feed-forward Networks", "track": "main", "status": "Reject", "tldr": "An unsupervised structure learning method for Parsimonious Deep Feed-forward Networks.", "abstract": "Convolutional neural networks and recurrent neural networks are designed with network structures well suited to the nature of spacial and sequential data respectively. However, the structure of standard feed-forward neural networks (FNNs) is simply a stack of fully connected layers, regardless of the feature correlations in data. In addition, the number of layers and the number of neurons are manually tuned on validation data, which is time-consuming and may lead to suboptimal networks. In this paper, we propose an unsupervised structure learning method for learning parsimonious deep FNNs. Our method determines the number of layers, the number of neurons at each layer, and the sparse connectivity between adjacent layers automatically from data. The resulting models are called Backbone-Skippath Neural Networks (BSNNs). Experiments on 17 tasks show that, in comparison with FNNs, BSNNs can achieve better or comparable classification performance with much fewer parameters. The interpretability of BSNNs is also shown to be better than that of FNNs.", "keywords": "Parsimonious Deep Feed-forward Networks;structure learning;classification;overfitting;fewer parameters;high interpretability", "primary_area": "", "supplementary_material": "", "author": "Zhourong Chen;Xiaopeng Li;Nevin L. Zhang", "authorids": "zchenbb@cse.ust.hk;xlibo@cse.ust.hk;lzhang@cse.ust.hk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2018learning,\ntitle={Learning Parsimonious Deep Feed-forward Networks},\nauthor={Zhourong Chen and Xiaopeng Li and Nevin L. 
Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=HJMN-xWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=HJMN-xWC-", "pdf_size": 0, "rating": "4;5;5", "confidence": "2;2;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8NR9NjfRZecJ:scholar.google.com/&scioq=Learning+Parsimonious+Deep+Feed-forward+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJNGGmZ0Z", "title": "What is image captioning made of?", "track": "main", "status": "Reject", "tldr": "This paper presents an empirical analysis on the role of different types of image representations and probes the properties of these representations for the task of image captioning.", "abstract": "We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn \u2018distributional similarity\u2019 in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the \u2018image\u2019 side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. We propose a sparse bag-of-objects vector as an interpretable representation to investigate our distributional similarity hypothesis. We found that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together; (iv) are heavily reliant on test sets with a similar distribution as the training set; (v) repeatedly generate the same captions by matching images and \u2018retrieving\u2019 a caption in the joint visual-textual space. Our experiments all point to one fact: that our distributional similarity hypothesis holds. 
We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace.\n", "keywords": "image captioning;representation learning;interpretability;rnn;multimodal;vision to language", "primary_area": "", "supplementary_material": "", "author": "Pranava Madhyastha;Josiah Wang;Lucia Specia", "authorids": "p.madhyastha@sheffield.ac.uk;j.k.wang@sheffield.ac.uk;l.specia@sheffield.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmadhyastha2018what,\ntitle={What is image captioning made of?},\nauthor={Pranava Madhyastha and Josiah Wang and Lucia Specia},\nyear={2018},\nurl={https://openreview.net/forum?id=HJNGGmZ0Z},\n}", "github": "[![github](/images/github_icon.svg) anonymousiclr/HJNGGmZ0Z](https://github.com/anonymousiclr/HJNGGmZ0Z)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJNGGmZ0Z", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJNMYceCW", "title": "Residual Loss Prediction: Reinforcement Learning With No Incremental Feedback", "track": "main", "status": "Poster", "tldr": "We present a novel algorithm for solving reinforcement learning and bandit structured prediction problems with very sparse loss feedback.", "abstract": "We consider reinforcement learning and bandit structured prediction problems with very sparse loss feedback: only at the end of an episode. We introduce a novel algorithm, RESIDUAL LOSS PREDICTION (RESLOPE), that solves such problems by automatically learning an internal representation of a denser reward function. RESLOPE operates as a reduction to contextual bandits, using its learned loss representation to solve the credit assignment problem, and a contextual bandit oracle to trade-off exploration and exploitation. 
RESLOPE enjoys a no-regret reduction-style theoretical guarantee and outperforms state of the art reinforcement learning algorithms in both MDP environments and bandit structured prediction settings.", "keywords": "Reinforcement Learning;Structured Prediction;Contextual Bandits;Learning Reduction", "primary_area": "", "supplementary_material": "", "author": "Hal Daum\u00e9 III;John Langford;Amr Sharaf", "authorids": "hal@umiacs.umd.edu;jl@hunch.net;amr@cs.umd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndaum\u00e92018residual,\ntitle={{RESIDUAL} {LOSS} {PREDICTION}: {REINFORCEMENT} {LEARNING} {WITH} {NO} {INCREMENTAL} {FEEDBACK}},\nauthor={Hal Daum\u00e9 III and John Langford and Paul Mineiro and Amr Sharaf},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJNMYceCW},\n}", "github": "[![github](/images/github_icon.svg) hal3/reslope](https://github.com/hal3/reslope)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJNMYceCW", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;5", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11251280234880641754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HJOQ7MgAW", "title": "Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum", "track": "main", "status": "Reject", "tldr": "Gates do all the heavy lifting in LSTMs by computing element-wise weighted sums, and removing the internal simple RNN does not degrade model performance.", "abstract": "Long short-term memory networks (LSTMs) were introduced to combat vanishing gradients in simple recurrent neural networks (S-RNNs) by augmenting them with additive recurrent connections controlled by gates. We present an alternate view to explain the success of LSTMs: the gates themselves are powerful recurrent models that provide more representational power than previously appreciated. We do this by showing that the LSTM's gates can be decoupled from the embedded S-RNN, producing a restricted class of RNNs where the main recurrence computes an element-wise weighted sum of context-independent functions of the inputs. 
Experiments on a range of challenging NLP problems demonstrate that the simplified gate-based models work substantially better than S-RNNs, and often just as well as the original LSTMs, strongly suggesting that the gates are doing much more in practice than just alleviating vanishing gradients.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Omer Levy;Kenton Lee;Nicholas FitzGerald;Luke Zettlemoyer", "authorids": "omerlevy@gmail.com;kentonl@cs.washington.edu;nfitz@cs.washington.edu;lsz@cs.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlevy2018long,\ntitle={Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum},\nauthor={Omer Levy and Kenton Lee and Nicholas FitzGerald and Luke Zettlemoyer},\nyear={2018},\nurl={https://openreview.net/forum?id=HJOQ7MgAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJOQ7MgAW", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18174076341933090388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJPSN3gRW", "title": "Learning to navigate by distilling visual information and natural language instructions", "track": "main", "status": "Reject", "tldr": "Attention based architecture for language grounding via reinforcement learning in a new customizable 2D grid environment ", "abstract": "In this work, we focus on the problem of grounding language by training an agent\nto follow a set of natural language instructions and navigate to a target object\nin a 2D grid environment. The agent receives visual information through raw\npixels and a natural language instruction telling what task needs to be achieved.\nOther than these two sources of information, our model does not have any prior\ninformation of both the visual and textual modalities and is end-to-end trainable.\nWe develop an attention mechanism for multi-modal fusion of visual and textual\nmodalities that allows the agent to learn to complete the navigation tasks and also\nachieve language grounding. Our experimental results show that our attention\nmechanism outperforms the existing multi-modal fusion mechanisms proposed in\norder to solve the above mentioned navigation task. We demonstrate through the\nvisualization of attention weights that our model learns to correlate attributes of\nthe object referred in the instruction with visual representations and also show\nthat the learnt textual representations are semantically meaningful as they follow\nvector arithmetic and are also consistent enough to induce translation between instructions\nin different natural languages. 
We also show that our model generalizes\neffectively to unseen scenarios and exhibit zero-shot generalization capabilities.\nIn order to simulate the above described challenges, we introduce a new 2D environment\nfor an agent to jointly learn visual and textual modalities", "keywords": "Deep reinforcement learning;Computer Vision;Multi-modal fusion;Language Grounding", "primary_area": "", "supplementary_material": "", "author": "Abhishek Sinha;Akilesh B;Mausoom Sarkar;Balaji Krishnamurthy", "authorids": "abhsinha@adobe.com;akb@adobe.com;msarkar@adobe.com;kbalaji@adobe.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsinha2018learning,\ntitle={Learning to navigate by distilling visual information and natural language instructions},\nauthor={Abhishek Sinha and Akilesh B and Mausoom Sarkar and Balaji Krishnamurthy},\nyear={2018},\nurl={https://openreview.net/forum?id=HJPSN3gRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJPSN3gRW", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IGESWY4b_GUJ:scholar.google.com/&scioq=Learning+to+navigate+by+distilling+visual+information+and+natural+language+instructions&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJRV1ZZAW", "title": "FAST READING COMPREHENSION WITH CONVNETS", "track": "main", "status": "Reject", "tldr": "", "abstract": "State-of-the-art deep reading comprehension models are dominated by recurrent\nneural nets. Their sequential nature is a natural fit for language, but it also precludes\nparallelization within an instances and often becomes the bottleneck for\ndeploying such models to latency critical scenarios. This is particularly problematic\nfor longer texts. Here we present a convolutional architecture as an alternative\nto these recurrent architectures. 
Using simple dilated convolutional units in place\nof recurrent ones, we achieve results comparable to the state of the art on two\nquestion answering tasks, while at the same time achieving up to two orders of\nmagnitude speedups for question answering.", "keywords": "reading comprehension;question answering;CNN;ConvNet;Inference", "primary_area": "", "supplementary_material": "", "author": "Felix Wu;Ni Lao;John Blitzer;Guandao Yang;Kilian Weinberger", "authorids": "fw245@cornell.edu;nlao@google.com;blitzer@google.com;gy46@cornell.edu;kqw4@cornell.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwu2018fast,\ntitle={{FAST} {READING} {COMPREHENSION} {WITH} {CONVNETS}},\nauthor={Felix Wu and Ni Lao and John Blitzer and Guandao Yang and Kilian Weinberger},\nyear={2018},\nurl={https://openreview.net/forum?id=HJRV1ZZAW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJRV1ZZAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJRV1ZZAW", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": -0.944911182523068, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17228094434416183929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HJSA_e1AW", "title": "Normalized Direction-preserving Adam", "track": "main", "status": "Reject", "tldr": "A tailored version of Adam for training DNNs, which bridges the generalization gap between Adam and SGD.", "abstract": "Optimization algorithms for training deep models not only affects the convergence rate and stability of the training process, but are also highly related to the generalization performance of trained models. While adaptive algorithms, such as Adam and RMSprop, have shown better optimization performance than stochastic gradient descent (SGD) in many scenarios, they often lead to worse generalization performance than SGD, when used for training deep neural networks (DNNs). In this work, we identify two problems regarding the direction and step size for updating the weight vectors of hidden units, which may degrade the generalization performance of Adam. As a solution, we propose the normalized direction-preserving Adam (ND-Adam) algorithm, which controls the update direction and step size more precisely, and thus bridges the generalization gap between Adam and SGD. Following a similar rationale, we further improve the generalization performance in classification tasks by regularizing the softmax logits. 
By bridging the gap between SGD and Adam, we also shed some light on why certain optimization algorithms generalize better than others.", "keywords": "optimization;generalization;Adam;SGD", "primary_area": "", "supplementary_material": "", "author": "Zijun Zhang;Lin Ma;Zongpeng Li;Chuan Wu", "authorids": "zijun.zhang@ucalgary.ca;linmawhu@gmail.com;zongpeng@ucalgary.ca;cwu@cs.hku.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2018normalized,\ntitle={Normalized Direction-preserving Adam},\nauthor={Zijun Zhang and Lin Ma and Zongpeng Li and Chuan Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=HJSA_e1AW},\n}", "github": "[![github](/images/github_icon.svg) zj10/ND-Adam](https://github.com/zj10/ND-Adam)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJSA_e1AW", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12948102037303062202&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJUOHGWRb", "title": "Contextual Explanation Networks", "track": "main", "status": "Reject", "tldr": "A class of networks that generate simple models on the fly (called explanations) that act as a regularizer and enable consistent model diagnostics and interpretability.", "abstract": "We introduce contextual explanation networks (CENs)---a class of models that learn to predict by generating and leveraging intermediate explanations. CENs are deep networks that generate parameters for context-specific probabilistic graphical models which are further used for prediction and play the role of explanations. Contrary to the existing post-hoc model-explanation tools, CENs learn to predict and to explain jointly. Our approach offers two major advantages: (i) for each prediction, valid instance-specific explanations are generated with no computational overhead and (ii) prediction via explanation acts as a regularization and boosts performance in low-resource settings. We prove that local approximations to the decision boundary of our networks are consistent with the generated explanations. Our results on image and text classification and survival analysis tasks demonstrate that CENs are competitive with the state-of-the-art while offering additional insights behind each prediction, valuable for decision support.", "keywords": "interpretability;regularization;deep learning;graphical models;model diagnostics;survival analysis", "primary_area": "", "supplementary_material": "", "author": "Maruan Al-Shedivat;Avinava Dubey;Eric P. Xing", "authorids": "alshedivat@cs.cmu.edu;akdubey@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nal-shedivat2018contextual,\ntitle={Contextual Explanation Networks},\nauthor={Maruan Al-Shedivat and Avinava Dubey and Eric P. 
Xing},\nyear={2018},\nurl={https://openreview.net/forum?id=HJUOHGWRb},\n}", "github": "[![github](/images/github_icon.svg) alshedivat/cen](https://github.com/alshedivat/cen)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJUOHGWRb", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;5;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11129916790884451349&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "HJWGdbbCW", "title": "Reinforcement and Imitation Learning for Diverse Visuomotor Skills", "track": "main", "status": "Reject", "tldr": "combine reinforcement learning and imitation learning to solve complex robot manipulation tasks from pixels", "abstract": "We propose a general deep reinforcement learning method and apply it to robot manipulation tasks. Our approach leverages demonstration data to assist a reinforcement learning agent in learning to solve a wide range of tasks, mainly previously unsolved. We train visuomotor policies end-to-end to learn a direct mapping from RGB camera inputs to joint velocities. Our experiments indicate that our reinforcement and imitation approach can solve contact-rich robot manipulation tasks that neither the state-of-the-art reinforcement nor imitation learning method can solve alone. We also illustrate that these policies achieved zero-shot sim2real transfer by training with large visual and dynamics variations.", "keywords": "reinforcement learning;imitation learning;robotics;visuomotor skills", "primary_area": "", "supplementary_material": "", "author": "Yuke Zhu;Ziyu Wang;Josh Merel;Andrei Rusu;Tom Erez;Serkan Cabi;Saran Tunyasuvunakool;J\u00e1nos Kram\u00e1r;Raia Hadsell;Nando de Freitas;Nicolas Heess", "authorids": "yukez@cs.stanford.edu;ziyu@google.com;jsmerel@google.com;andreirusu@google.com;etom@google.com;cabi@google.com;stunya@google.com;janosk@google.com;raia@google.com;nandodefreitas@google.com;heess@google.com", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\nzhu2018reinforcement,\ntitle={Reinforcement and Imitation Learning for Diverse Visuomotor Skills},\nauthor={Yuke Zhu and Ziyu Wang and Josh Merel and Andrei Rusu and Tom Erez and Serkan Cabi and Saran Tunyasuvunakool and J\u00e1nos Kram\u00e1r and Raia Hadsell and Nando de Freitas and Nicolas Heess},\nyear={2018},\nurl={https://openreview.net/forum?id=HJWGdbbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJWGdbbCW", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 11, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 394, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16353226391702260751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "title": "Matrix capsules with EM routing", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/87", "id": "HJWLfGWRb", "author_site": "Geoffrey E Hinton, Sara Sabour, Nicholas Frosst", "tldr": "Capsule networks with 
learned pose matrices and EM routing improves state of the art classification on smallNORB, improves generalizability to new view points, and white box adversarial robustness. ", "abstract": "A capsule is a group of neurons whose outputs represent different properties of the same entity. Each layer in a capsule network contains many capsules. We describe a version of capsules in which each capsule has a logistic unit to represent the presence of an entity and a 4x4 matrix which could learn to represent the relationship between that entity and the viewer (the pose). A capsule in one layer votes for the pose matrix of many different capsules in the layer above by multiplying its own pose matrix by trainable viewpoint-invariant transformation matrices that could learn to represent part-whole relationships. Each of these votes is weighted by an assignment coefficient. These coefficients are iteratively updated for each image using the Expectation-Maximization algorithm such that the output of each capsule is routed to a capsule in the layer above that receives a cluster of similar votes. The transformation matrices are trained discriminatively by backpropagating through the unrolled iterations of EM between each pair of adjacent capsule layers. On the smallNORB benchmark, capsules reduce the number of test errors by 45\\% compared to the state-of-the-art. Capsules also show far more resistance to white box adversarial attacks than our baseline convolutional neural network.", "keywords": "Computer Vision;Deep Learning;Dynamic routing", "primary_area": "", "supplementary_material": "", "author": "Geoffrey E Hinton;Sara Sabour;Nicholas Frosst", "authorids": "geoffhinton@google.com;sasabour@google.com;frosst@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ne2018matrix,\ntitle={Matrix capsules with {EM} routing},\nauthor={Geoffrey E Hinton and Sara Sabour and Nicholas Frosst},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJWLfGWRb},\n}", "github": "[![github](/images/github_icon.svg) google-research/google-research](https://github.com/google-research/google-research/tree/master/capsule_em) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HJWLfGWRb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "2;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 46, "authors#_avg": 3, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 1356, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16246220969925140156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HJWLfGWRb", "pdf": "https://openreview.net/pdf?id=HJWLfGWRb", "email": ";;", "author_num": 3 }, { "id": "HJWpQCa7z", "title": "Deep Net Triage: Assessing The Criticality of Network Layers by Structural Compression", "track": "main", "status": "Withdraw", "tldr": "We seek to understand learned representations in compressed networks via an experimental regime we call deep net triage", "abstract": "Deep network compression seeks to reduce the number of parameters in the network while maintaining a certain level of performance. 
Deep network distillation seeks to train a smaller network that matches soft-max performance of a larger network. While both regimes have led to impressive performance for their respective goals, neither provide insight into the importance of a given layer in the original model, which is useful if we are to improve our understanding of these highly parameterized models. In this paper, we present the concept of deep net triage, which individually assesses small blocks of convolution layers to understand their collective contribution to the overall performance, which we call \\emph{criticality}. We call it triage because we assess this criticality by answering the question: what is the impact to the health of the overall network if we compress a block of layers into a single layer.\nWe propose a suite of triage methods and compare them on problem spaces of varying complexity. We ultimately show that, across these problem spaces, deep net triage is able to indicate the relative importance of different layers. Surprisingly, our local structural compression technique also leads to an improvement in overall accuracy when the final model is fine-tuned globally.", "keywords": "Deep Compression;Deep Learning;Parent-Teacher Networks", "primary_area": "", "supplementary_material": "", "author": "Theodore S. Nowak;Jason J. Corso", "authorids": "tsnowak@umich.edu;jjcorso@umich.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJWpQCa7z", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 3, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2110329454466760743&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJWu8i18G", "title": "title", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "abstract", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "authors", "authorids": "", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HJWu8i18G", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJXOfZ-AZ", "title": "When and where do feed-forward neural networks learn localist representations?", "track": "main", "status": "Reject", "tldr": "Local codes have been found in feed-forward neural networks", "abstract": "According to parallel distributed processing (PDP) theory in psychology, neural networks (NN) learn distributed rather than interpretable localist representations. This view has been held so strongly that few researchers have analysed single units to determine if this assumption is correct. However, recent results from psychology, neuroscience and computer science have shown the occasional existence of local codes emerging in artificial and biological neural networks. 
In this paper, we undertake the first systematic survey of when local codes emerge in a feed-forward neural network, using generated input and output data with known qualities. We find that the number of local codes that emerge from a NN follows a well-defined distribution across the number of hidden layer neurons, with a peak determined by the size of input data, number of examples presented and the sparsity of input data. Using a 1-hot output code drastically decreases the number of local codes on the hidden layer. The number of emergent local codes increases with the percentage of dropout applied to the hidden layer, suggesting that the localist encoding may offer a resilience to noisy networks. This data suggests that localist coding can emerge from feed-forward PDP networks and suggests some of the conditions that may lead to interpretable localist representations in the cortex. The findings highlight how local codes should not be dismissed out of hand.", "keywords": "localist;pdp;neural network;representation;psychology;cognition", "primary_area": "", "supplementary_material": "", "author": "Ella M. Gale;Nicolas Martin;Jeffrey Bowers", "authorids": "eg16993@bristol.ac.uk;nm13850@bristol.ac.uk;j.bowers@bristol.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nm.2018when,\ntitle={When and where do feed-forward neural networks learn localist representations?},\nauthor={Ella M. Gale and Nicolas Martin and Jeffrey Bowers},\nyear={2018},\nurl={https://openreview.net/forum?id=HJXOfZ-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJXOfZ-AZ", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;3;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14817768576951252443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJXyS7bRb", "title": "A Goal-oriented Neural Conversation Model by Self-Play", "track": "main", "status": "Reject", "tldr": "A Goal-oriented Neural Conversation Model by Self-Play", "abstract": "Building chatbots that can accomplish goals such as booking a flight ticket is an unsolved problem in natural language understanding. Much progress has been made to build conversation models using techniques such as sequence2sequence modeling. One challenge in applying such techniques to building goal-oriented conversation models is that maximum likelihood-based models are not optimized toward accomplishing goals. Recently, many methods have been proposed to address this issue by optimizing a reward that contains task status or outcome. However, adding the reward optimization on the fly usually provides little guidance for language construction and the conversation model soon becomes decoupled from the language model. In this paper, we propose a new setting in goal-oriented dialogue system to tighten the gap between these two aspects by enforcing model level information isolation on individual models between two agents. Language construction now becomes an important part in reward optimization since it is the only way information can be exchanged. 
We experimented our models using self-play and results showed that our method not only beat the baseline sequence2sequence model in rewards but can also generate human-readable meaningful conversations of comparable quality. ", "keywords": "conversation model;seq2seq;self-play;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Wei Wei;Quoc V. Le;Andrew M. Dai;Li-Jia Li", "authorids": "wewei@google.com;adai@google.com;qvl@google.com;lijiali@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwei2018a,\ntitle={A Goal-oriented Neural Conversation Model by Self-Play},\nauthor={Wei Wei and Quoc V. Le and Andrew M. Dai and Li-Jia Li},\nyear={2018},\nurl={https://openreview.net/forum?id=HJXyS7bRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJXyS7bRb", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10579410006537875848&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJYQLb-RW", "title": "On the limitations of first order approximation in GAN dynamics", "track": "main", "status": "Workshop", "tldr": "To understand GAN training, we define simple GAN dynamics, and show quantitative differences between optimal and first order updates in this model.", "abstract": "Generative Adversarial Networks (GANs) have been proposed as an approach to learning generative models. While GANs have demonstrated promising performance on multiple vision tasks, their learning dynamics are not yet well understood, neither in theory nor in practice. In particular, the work in this domain has been focused so far only on understanding the properties of the stationary solutions that this dynamics might converge to, and of the behavior of that dynamics in this solutions\u2019 immediate neighborhood.\n\nTo address this issue, in this work we take a first step towards a principled study of the GAN dynamics itself. To this end, we propose a model that, on one hand, exhibits several of the common problematic convergence behaviors (e.g., vanishing gradient, mode collapse, diverging or oscillatory behavior), but on the other hand, is sufficiently simple to enable rigorous convergence analysis.\n\nThis methodology enables us to exhibit an interesting phenomena: a GAN with an optimal discriminator provably converges, while guiding the GAN training using only a first order approximation of the discriminator leads to unstable GAN dynamics and mode collapse. This suggests that such usage of the first order approximation of the discriminator, which is a de-facto standard in all the existing GAN dynamics, might be one of the factors that makes GAN training so challenging in practice. 
Additionally, our convergence result constitutes the first rigorous analysis of a dynamics of a concrete parametric GAN.", "keywords": "GANs;first order dynamics;convergence;mode collapse", "primary_area": "", "supplementary_material": "", "author": "Jerry Li;Aleksander Madry;John Peebles;Ludwig Schmidt", "authorids": "jerryzli@mit.edu;madry@mit.edu;jpeebles@mit.edu;ludwigs@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2018on,\ntitle={On the limitations of first order approximation in {GAN} dynamics},\nauthor={Jerry Li and Aleksander Madry and John Peebles and Ludwig Schmidt},\nyear={2018},\nurl={https://openreview.net/forum?id=HJYQLb-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJYQLb-RW", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12853395852540879423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJYoqzbC-", "title": "A comparison of second-order methods for deep convolutional neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite many second-order methods have been proposed to train neural networks, most of the results were done on smaller single layer fully connected networks, so we still cannot conclude whether it's useful in training deep convolutional networks. In this study, we conduct extensive experiments to answer the question \"whether second-order method is useful for deep learning?\". In our analysis, we find out although currently second-order methods are too slow to be applied in practice, it can reduce training loss in fewer number of iterations compared with SGD. In addition, we have the following interesting findings: (1) When using a large batch size, inexact-Newton methods will converge much faster than SGD. Therefore inexact-Newton method could be a better choice in distributed training of deep networks. (2) Quasi-newton methods are competitive with SGD even when using ReLu activation function (which has no curvature) on residual networks. However, current methods are too sensitive to parameters and not easy to tune for different settings. Therefore, quasi-newton methods with more self-adjusting mechanisms might be more useful than SGD in training deeper networks. \n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Patrick H. Chen;Cho-jui Hsieh", "authorids": "phpchen@ucdavis.edu;chohsieh@ucdavis.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018a,\ntitle={A comparison of second-order methods for deep convolutional neural networks},\nauthor={Patrick H. 
Chen and Cho-jui Hsieh},\nyear={2018},\nurl={https://openreview.net/forum?id=HJYoqzbC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJYoqzbC-", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;5;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.3273268353539886, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15888474139459107608&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJZiRkZC-", "title": "Byte-Level Recursive Convolutional Auto-Encoder for Text", "track": "main", "status": "Reject", "tldr": "", "abstract": "This article proposes to auto-encode text at byte-level using convolutional networks with a recursive architecture. The motivation is to explore whether it is possible to have scalable and homogeneous text generation at byte-level in a non-sequential fashion through the simple task of auto-encoding. We show that non-sequential text generation from a fixed-length representation is not only possible, but also achieved much better auto-encoding results than recurrent networks. The proposed model is a multi-stage deep convolutional encoder-decoder framework using residual connections, containing up to 160 parameterized layers. Each encoder or decoder contains a shared group of modules that consists of either pooling or upsampling layers, making the network recursive in terms of abstraction levels in representation. Results for 6 large-scale paragraph datasets are reported, in 3 languages including Arabic, Chinese and English. Analyses are conducted to study several properties of the proposed model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiang Zhang;Yann LeCun", "authorids": "xiang@cs.nyu.edu;yann@cs.nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2018bytelevel,\ntitle={Byte-Level Recursive Convolutional Auto-Encoder for Text},\nauthor={Xiang Zhang and Yann LeCun},\nyear={2018},\nurl={https://openreview.net/forum?id=HJZiRkZC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJZiRkZC-", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7594797594694509499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJ_X8GupW", "title": "Multi-label Learning for Large Text Corpora using Latent Variable Model with Provable Gurantees", "track": "main", "status": "Reject", "tldr": "", "abstract": "Here we study the problem of learning labels for large text corpora where each document can be assigned a variable number of labels. The problem is trivial when the label dimensionality is small and can be easily solved by a series of one-vs-all classifiers. However, as the label dimensionality increases, the parameter space of such one-vs-all classifiers becomes extremely large and outstrips the memory. Here we propose a latent variable model to reduce the size of the parameter space, but still efficiently learn the labels. 
We learn the model using spectral learning and show how to extract the parameters using only three passes through the training dataset. Further, we analyse the sample complexity of our model using PAC learning theory and then demonstrate the performance of our algorithm on several benchmark datasets in comparison with existing algorithms.\n", "keywords": "Spectral Method;Multi-label Learning;Tensor Factorisation", "primary_area": "", "supplementary_material": "", "author": "Sayantan Dasgupta", "authorids": "sayandg@umich.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndasgupta2018multilabel,\ntitle={Multi-label Learning for Large Text Corpora using Latent Variable Model with Provable Gurantees},\nauthor={Sayantan Dasgupta},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ_X8GupW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJ_X8GupW", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Y7uf7EDMjkQJ:scholar.google.com/&scioq=Multi-label+Learning+for+Large+Text+Corpora+using+Latent+Variable+Model+with+Provable+Gurantees&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "SpectralNet: Spectral Clustering using Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/290", "id": "HJ_aoCyRZ", "author_site": "Uri Shaham, Kelly Stanton, Henry Li, Ronen Basri, Boaz Nadler, Yuval Kluger", "tldr": "Unsupervised spectral clustering using deep neural networks", "abstract": "Spectral clustering is a leading and popular technique in unsupervised data analysis. Two of its major limitations are scalability and generalization of the spectral embedding (i.e., out-of-sample-extension). In this paper we introduce a deep learning approach to spectral clustering that overcomes the above shortcomings. Our network, which we call SpectralNet, learns a map that embeds input data points into the eigenspace of their associated graph Laplacian matrix and subsequently clusters them. We train SpectralNet using a procedure that involves constrained stochastic optimization. Stochastic optimization allows it to scale to large datasets, while the constraints, which are implemented using a special purpose output layer, allow us to keep the network output orthogonal. Moreover, the map learned by SpectralNet naturally generalizes the spectral embedding to unseen data points. To further improve the quality of the clustering, we replace the standard pairwise Gaussian affinities with affinities learned from unlabeled data using a Siamese network. Additional improvement can be achieved by applying the network to code representations produced, e.g., by standard autoencoders. Our end-to-end learning procedure is fully unsupervised. In addition, we apply VC dimension theory to derive a lower bound on the size of SpectralNet. 
State-of-the-art clustering results are reported for both the MNIST and Reuters datasets.\n", "keywords": "unsupervised learning;spectral clustering;siamese networks", "primary_area": "", "supplementary_material": "", "author": "Uri Shaham;Kelly Stanton;Henry Li;Ronen Basri;Boaz Nadler;Yuval Kluger", "authorids": "uri.shaham@yale.edu;kelly.stanton@yale.edu;henry.li@yale.edu;ronen.basri@gmail.com;boaz.nadler@gmail.com;yuval.kluger@yale.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nshaham2018spectralnet,\ntitle={SpectralNet: Spectral Clustering using Deep Neural Networks},\nauthor={Uri Shaham and Kelly Stanton and Henry Li and Ronen Basri and Boaz Nadler and Yuval Kluger},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJ_aoCyRZ},\n}", "github": "[![github](/images/github_icon.svg) kstant0725/SpectralNet](https://github.com/kstant0725/SpectralNet) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJ_aoCyRZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.3273268353539886, "gs_citation": 387, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4554119900285680620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=HJ_aoCyRZ", "pdf": "https://openreview.net/pdf?id=HJ_aoCyRZ", "email": ";;;;;", "author_num": 6 }, { "id": "HJaDJZ-0W", "title": "Block-Sparse Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "We show the RNNs can be pruned to induce block sparsity which improves speedup for sparse operations on existing hardware", "abstract": "Recurrent Neural Networks (RNNs) are used in state-of-the-art models in domains such as speech recognition, machine translation, and language modelling. Sparsity is a technique to reduce compute and memory requirements of deep learning models. Sparse RNNs are easier to deploy on devices and high-end server processors. Even though sparse operations need less compute and memory relative to their dense counterparts, the speed-up observed by using sparse operations is less than expected on different hardware platforms. In order to address this issue, we investigate two different approaches to induce block sparsity in RNNs: pruning blocks of weights in a layer and using group lasso regularization with pruning to create blocks of weights with zeros. Using these techniques, we can create block-sparse RNNs with sparsity ranging from 80% to 90% with a small loss in accuracy. This technique allows us to reduce the model size by roughly 10x. Additionally, we can prune a larger dense network to recover this loss in accuracy while maintaining high block sparsity and reducing the overall parameter count. Our technique works with a variety of block sizes up to 32x32. 
Block-sparse RNNs eliminate overheads related to data storage and irregular memory accesses while increasing hardware efficiency compared to unstructured sparsity.\n", "keywords": "Pruning;block sparsity;structured sparsity;Recurrent Neural Networks;Speech Recognition", "primary_area": "", "supplementary_material": "", "author": "Sharan Narang;Eric Undersander;Gregory Diamos", "authorids": "sharan@baidu.com;undersandereric@baidu.com;gdiamos@baidu.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnarang2018blocksparse,\ntitle={Block-Sparse Recurrent Neural Networks},\nauthor={Sharan Narang and Eric Undersander and Gregory Diamos},\nyear={2018},\nurl={https://openreview.net/forum?id=HJaDJZ-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJaDJZ-0W", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 167, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14378681490896004349&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Meta-Learning for Semi-Supervised Few-Shot Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/88", "id": "HJcSzz-CZ", "author_site": "Mengye Ren, Eleni Triantafillou, Sachin Ravi, Jake Snell, Kevin Swersky, Joshua B Tenenbaum, Hugo Larochelle, Richard Zemel", "tldr": "We propose novel extensions of Prototypical Networks that are augmented with the ability to use unlabeled examples when producing prototypes.", "abstract": "In few-shot classification, we are interested in learning algorithms that train a classifier from only a handful of labeled examples. Recent progress in few-shot classification has featured meta-learning, in which a parameterized model for a learning algorithm is defined and trained on episodes representing different classification problems, each with a small labeled training set and its corresponding test set. In this work, we advance this few-shot classification paradigm towards a scenario where unlabeled examples are also available within each episode. We consider two situations: one where all unlabeled examples are assumed to belong to the same set of classes as the labeled examples of the episode, as well as the more challenging situation where examples from other distractor classes are also provided. To address this paradigm, we propose novel extensions of Prototypical Networks (Snell et al., 2017) that are augmented with the ability to use unlabeled examples when producing prototypes. These models are trained in an end-to-end way on episodes, to learn to leverage the unlabeled examples successfully. We evaluate these methods on versions of the Omniglot and miniImageNet benchmarks, adapted to this new framework augmented with unlabeled examples. We also propose a new split of ImageNet, consisting of a large set of classes, with a hierarchical structure. 
Our experiments confirm that our Prototypical Networks can learn to improve their predictions due to unlabeled examples, much like a semi-supervised algorithm would.", "keywords": "Few-shot learning;semi-supervised learning;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Mengye Ren;Eleni Triantafillou;Sachin Ravi;Jake Snell;Kevin Swersky;Joshua B. Tenenbaum;Hugo Larochelle;Richard S. Zemel", "authorids": "mren@cs.toronto.edu;eleni@cs.toronto.edu;sachinr@princeton.edu;jsnell@cs.toronto.edu;kswersky@google.com;jbt@mit.edu;hugolarochelle@google.com;zemel@cs.toronto.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nren2018metalearning,\ntitle={Meta-Learning for Semi-Supervised Few-Shot Classification},\nauthor={Mengye Ren and Sachin Ravi and Eleni Triantafillou and Jake Snell and Kevin Swersky and Josh B. Tenenbaum and Hugo Larochelle and Richard S. Zemel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJcSzz-CZ},\n}", "github": "[![github](/images/github_icon.svg) renmengye/few-shot-ssl-public](https://github.com/renmengye/few-shot-ssl-public) + [![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=HJcSzz-CZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 1791, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=798380540199769906&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=HJcSzz-CZ", "pdf": "https://openreview.net/pdf?id=HJcSzz-CZ", "email": ";;;;;;;", "author_num": 8 }, { "id": "HJcjQTJ0W", "title": "PrivyNet: A Flexible Framework for Privacy-Preserving Deep Neural Network Training", "track": "main", "status": "Reject", "tldr": "To enable cloud-based DNN training while protecting the data privacy simultaneously, we propose to leverage the intermediate data representations, which is achieved by splitting the DNNs and deploying them separately onto local platforms and the cloud.", "abstract": "Massive data exist among user local platforms that usually cannot support deep neural network (DNN) training due to computation and storage resource constraints. Cloud-based training schemes provide beneficial services but suffer from potential privacy risks due to excessive user data collection. To enable cloud-based DNN training while protecting the data privacy simultaneously, we propose to leverage the intermediate representations of the data, which is achieved by splitting the DNNs and deploying them separately onto local platforms and the cloud. The local neural network (NN) is used to generate the feature representations. To avoid local training and protect data privacy, the local NN is derived from pre-trained NNs. The cloud NN is then trained based on the extracted intermediate representations for the target learning task. We validate the idea of DNN splitting by characterizing the dependency of privacy loss and classification accuracy on the local NN topology for a convolutional NN (CNN) based image classification task. 
Based on the characterization, we further propose PrivyNet to determine the local NN topology, which optimizes the accuracy of the target learning task under the constraints on privacy loss, local computation, and storage. The efficiency and effectiveness of PrivyNet are demonstrated with CIFAR-10 dataset.", "keywords": "Privacy-preserving deep learning;Neural network training", "primary_area": "", "supplementary_material": "", "author": "Meng Li;Liangzhen Lai;Naveen Suda;Vikas Chandra;David Z. Pan", "authorids": "meng_li@utexas.edu;liangzhen.lai@arm.com;naveen.suda@arm.com;vikas.chandra@arm.com;dpan@ece.utexas.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2018privynet,\ntitle={PrivyNet: A Flexible Framework for Privacy-Preserving Deep Neural Network Training},\nauthor={Meng Li and Liangzhen Lai and Naveen Suda and Vikas Chandra and David Z. Pan},\nyear={2018},\nurl={https://openreview.net/forum?id=HJcjQTJ0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJcjQTJ0W", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.7559289460184546, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4082883201730729265&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJdXGy1RW", "title": "CrescendoNet: A Simple Deep Convolutional Neural Network with Ensemble Behavior", "track": "main", "status": "Reject", "tldr": "We introduce CrescendoNet, a deep CNN architecture by stacking simple building blocks without residual connections.", "abstract": "We introduce a new deep convolutional neural network, CrescendoNet, by stacking simple building blocks without residual connections. Each Crescendo block contains independent convolution paths with increased depths. The numbers of convolution layers and parameters are only increased linearly in Crescendo blocks. In experiments, CrescendoNet with only 15 layers outperforms almost all networks without residual connections on benchmark datasets, CIFAR10, CIFAR100, and SVHN. Given sufficient amount of data as in SVHN dataset, CrescendoNet with 15 layers and 4.1M parameters can match the performance of DenseNet-BC with 250 layers and 15.3M parameters. CrescendoNet provides a new way to construct high performance deep convolutional neural networks without residual connections. Moreover, through investigating the behavior and performance of subnetworks in CrescendoNet, we note that the high performance of CrescendoNet may come from its implicit ensemble behavior, which differs from the FractalNet that is also a deep convolutional neural network without residual connections. 
Furthermore, the independence between paths in CrescendoNet allows us to introduce a new path-wise training procedure, which can reduce the memory needed for training.", "keywords": "CNN;ensemble;image recognition", "primary_area": "", "supplementary_material": "", "author": "Xiang Zhang;Nishant Vishwamitra;Hongxin Hu;Feng Luo", "authorids": "xzhang7@clemson.edu;nvishwa@clemson.edu;luofeng@clemson.edu;hongxih@clemson.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2018crescendonet,\ntitle={CrescendoNet: A Simple Deep Convolutional Neural Network with Ensemble Behavior},\nauthor={Xiang Zhang and Nishant Vishwamitra and Hongxin Hu and Feng Luo},\nyear={2018},\nurl={https://openreview.net/forum?id=HJdXGy1RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJdXGy1RW", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3424574725266183191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Learning to Teach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/193", "id": "HJewuJWCZ", "author_site": "Yang Fan, Fei Tian, Tao Qin, Tie-Yan Liu", "tldr": "We propose and verify the effectiveness of learning to teach, a new framework to automatically guide machine learning process.", "abstract": "Teaching plays a very important role in our society, by spreading human knowledge and educating our next generations. A good teacher will select appropriate teaching materials, impact suitable methodologies, and set up targeted examinations, according to the learning behaviors of the students. In the field of artificial intelligence, however, one has not fully explored the role of teaching, and pays most attention to machine \\emph{learning}. In this paper, we argue that equal attention, if not more, should be paid to teaching, and furthermore, an optimization framework (instead of heuristics) should be used to obtain good teaching strategies. We call this approach ``learning to teach''. In the approach, two intelligent agents interact with each other: a student model (which corresponds to the learner in traditional machine learning algorithms), and a teacher model (which determines the appropriate data, loss function, and hypothesis space to facilitate the training of the student model). The teacher model leverages the feedback from the student model to optimize its own teaching strategies by means of reinforcement learning, so as to achieve teacher-student co-evolution. 
To demonstrate the practical value of our proposed approach, we take the training of deep neural networks (DNN) as an example, and show that by using the learning to teach techniques, we are able to use much less training data and fewer iterations to achieve almost the same accuracy for different kinds of DNN models (e.g., multi-layer perceptron, convolutional neural networks and recurrent neural networks) under various machine learning tasks (e.g., image classification and text understanding).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yang Fan;Fei Tian;Tao Qin;Xiang-Yang Li;Tie-Yan Liu", "authorids": ";;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nfan2018learning,\ntitle={Learning to Teach},\nauthor={Yang Fan and Fei Tian and Tao Qin and Xiang-Yang Li and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJewuJWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;8;9", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.6933752452815364, "gs_citation": 2782, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16063736443639851202&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "openreview": "https://openreview.net/forum?id=HJewuJWCZ", "pdf": "https://openreview.net/pdf?id=HJewuJWCZ", "email": ";;;;", "author_num": 5 }, { "id": "HJg1NTGZRZ", "title": "Bit-Regularized Optimization of Neural Nets", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a novel regularization strategy for training neural networks which we call ``BitNet''. The parameters of neural networks are usually unconstrained and have a dynamic range dispersed over a real valued range. Our key idea is to control the expressive power of the network by dynamically quantizing the range and set of values that the parameters can take. We formulate this idea using a novel end-to-end approach that regularizes a typical classification loss function. Our regularizer is inspired by the Minimum Description Length (MDL) principle. For each layer of the network, our approach optimizes a translation and scaling factor along with integer-valued parameters. We empirically compare BitNet to an equivalent unregularized model on the MNIST and CIFAR-10 datasets. We show that BitNet converges faster to a superior quality solution. Additionally, the resulting model is significantly smaller in size due to the use of integer instead of floating-point parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohamed Amer;Aswin Raghavan;Graham W. Taylor;Sek Chai", "authorids": "mohamed.amer@sri.com;aswin.raghavan@sri.com;gwtaylor@uoguelph.ca;sek.chai@sri.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\namer2018bitregularized,\ntitle={Bit-Regularized Optimization of Neural Nets},\nauthor={Mohamed Amer and Aswin Raghavan and Graham W. 
Taylor and Sek Chai},\nyear={2018},\nurl={https://openreview.net/forum?id=HJg1NTGZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJg1NTGZRZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lkWlZDG5DZ0J:scholar.google.com/&scioq=Bit-Regularized+Optimization+of+Neural+Nets&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Learning a neural response metric for retinal prosthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/206", "id": "HJhIM0xAW", "author_site": "Nishal Shah, Sasidhar Madugula, E.J. Chichilnisky, Yoram Singer, Jonathon Shlens", "tldr": "Using triplets to learn a metric for comparing neural responses and improve the performance of a prosthesis.", "abstract": "Retinal prostheses for treating incurable blindness are designed to electrically stimulate surviving retinal neurons, causing them to send artificial visual signals to the brain. However, electrical stimulation generally cannot precisely reproduce normal patterns of neural activity in the retina. Therefore, an electrical stimulus must be selected that produces a neural response as close as possible to the desired response. This requires a technique for computing a distance between the desired response and the achievable response that is meaningful in terms of the visual signal being conveyed. Here we propose a method to learn such a metric on neural responses, directly from recorded light responses of a population of retinal ganglion cells (RGCs) in the primate retina. The learned metric produces a measure of similarity of RGC population responses that accurately reflects the similarity of the visual input. 
Using data from electrical stimulation experiments, we demonstrate that this metric may improve the performance of a prosthesis.", "keywords": "Metric learning;Computational Neuroscience;Retina;Neural Prosthesis", "primary_area": "", "supplementary_material": "", "author": "Nishal P Shah;Sasidhar Madugula;EJ Chichilnisky;Yoram Singer;Jonathon Shlens", "authorids": "nishalps@stanford.edu;sasidhar@stanford.edu;ej@stanford.edu;singer@google.com;shlens@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\np2018learning,\ntitle={Learning a neural response metric for retinal prosthesis},\nauthor={Nishal P Shah and Sasidhar Madugula and EJ Chichilnisky and Yoram Singer and Jonathon Shlens},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJhIM0xAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12862320072794108403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=HJhIM0xAW", "pdf": "https://openreview.net/pdf?id=HJhIM0xAW", "email": ";;;;", "author_num": 5 }, { "id": "HJjePwx0-", "title": "Better Generalization by Efficient Trust Region Method", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we develop a trust region method for training deep neural networks. At each iteration, trust region method computes the search direction by solving a non-convex subproblem. Solving this subproblem is non-trivial---existing methods have only sub-linear convergence rate. In the first part, we show that a simple modification of gradient descent algorithm can converge to a global minimizer of the subproblem with an asymptotic linear convergence rate. Moreover, our method only requires Hessian-vector products, which can be computed efficiently by back-propagation in neural networks. In the second part, we apply our algorithm to train large-scale convolutional neural networks, such as VGG and MobileNets. Although trust region method is about 3 times slower than SGD in terms of running time, we observe it finds a model that has lower generalization (test) error than SGD, and this difference is even more significant in large batch training. \nWe conduct several interesting experiments to support our conjecture that the trust region method can avoid sharp local minimas.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuanqing Liu;Jason D. Lee;Cho-Jui Hsieh", "authorids": "xqliu@ucdavis.edu;jasondlee88@gmail.com;chohsieh@ucdavis.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2018better,\ntitle={Better Generalization by Efficient Trust Region Method},\nauthor={Xuanqing Liu and Jason D. 
Lee and Cho-Jui Hsieh},\nyear={2018},\nurl={https://openreview.net/forum?id=HJjePwx0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJjePwx0-", "pdf_size": 0, "rating": "5;6;6", "confidence": "2;3;5", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rH0VMd2w1YcJ:scholar.google.com/&scioq=Better+Generalization+by+Efficient+Trust+Region+Method&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJjvxl-Cb", "title": "Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor", "track": "main", "status": "Workshop", "tldr": "We propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework.", "abstract": "Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy - that is, succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as either off-policy Q-learning, or on-policy policy gradient methods. By combining off-policy updates with a stable stochastic actor-critic formulation, our method achieves state-of-the-art performance on a range of continuous control benchmark tasks, outperforming prior on-policy and off-policy methods. 
Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds.", "keywords": "deep reinforcement learning;maximum entropy learning;stochastic actor-critic", "primary_area": "", "supplementary_material": "", "author": "Tuomas Haarnoja;Aurick Zhou;Pieter Abbeel;Sergey Levine", "authorids": "haarnoja@berkeley.edu;azhou42@berkeley.edu;pabbeel@cs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhaarnoja2018soft,\ntitle={Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor},\nauthor={Tuomas Haarnoja and Aurick Zhou and Pieter Abbeel and Sergey Levine},\nyear={2018},\nurl={https://openreview.net/forum?id=HJjvxl-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJjvxl-Cb", "pdf_size": 0, "rating": "3;5;7", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 11310, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13282174879342015249&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "HJnQJXbC-", "title": "AMPNet: Asynchronous Model-Parallel Training for Dynamic Neural Networks", "track": "main", "status": "Reject", "tldr": "Using asynchronous gradient updates to accelerate dynamic neural network training", "abstract": "\nNew types of compute hardware in development and entering the market hold the promise of revolutionizing deep learning in a manner as profound as GPUs. However, existing software frameworks and training algorithms for deep learning have yet to evolve to fully leverage the capability of the new wave of silicon. In particular, models that exploit structured input via complex and instance-dependent control flow are difficult to accelerate using existing algorithms and hardware that typically rely on minibatching. We present an asynchronous model-parallel (AMP) training algorithm that is specifically motivated by training on networks of interconnected devices. Through an implementation on multi-core CPUs, we show that AMP training converges to the same accuracy as conventional synchronous training algorithms in a similar number of epochs, but utilizes the available hardware more efficiently, even for small minibatch sizes, resulting in shorter overall training times. Our framework opens the door for scaling up a new class of deep learning models that cannot be efficiently trained today.", "keywords": "asynchronous;neural network;deep learning;graph;tree;rnn", "primary_area": "", "supplementary_material": "", "author": "Alexander L. Gaunt;Matthew A. 
Johnson;Alan Lawrence;Maik Riechert;Daniel Tarlow;Ryota Tomioka;Dimitrios Vytiniotis;Sam Webster", "authorids": "algaunt@microsoft.com;matjoh@microsoft.com;allawr@microsoft.com;a-mariec@microsoft.com;dannytarlow@gmail.com;ryoto@microsoft.com;dimitris@microsoft.com;sweb@microsoft.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nl.2018ampnet,\ntitle={{AMPN}et: Asynchronous Model-Parallel Training for Dynamic Neural Networks},\nauthor={Alexander L. Gaunt and Matthew A. Johnson and Alan Lawrence and Maik Riechert and Daniel Tarlow and Ryota Tomioka and Dimitrios Vytiniotis and Sam Webster},\nyear={2018},\nurl={https://openreview.net/forum?id=HJnQJXbC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJnQJXbC-", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 8, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14061533502370197553&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJoYD6gC-", "title": "Word Mover's Embedding: From Word2Vec to Document Embedding", "track": "main", "status": "Active", "tldr": "A novel approach to building an unsupervised document (sentence) embeddings from pre-trainedword embeddings", "abstract": "Learning effective text representations is a key foundation for numerous machine learning and NLP applications. While the celebrated Word2Vec technique yields semantically rich word representations, it is less clear whether sentence or document representations should be built upon word representations or from scratch. Recent work has demonstrated that a distance measure between documents called \\emph{Word Mover's Distance} (WMD) that aligns semantically similar words, yields unprecedented KNN classification accuracy. However, WMD is very expensive to compute, and is harder to apply beyond simple KNN than feature embeddings. In this paper, we propose the \\emph{Word Mover's Embedding } (WME), a novel approach to building an unsupervised document (sentence) embedding from pre-trained word embeddings. Our technique extends the theory of \\emph{Random Features} to show convergence of the inner product between WMEs to a positive-definite kernel that can be interpreted as a soft version of (inverse) WMD. The proposed embedding is more efficient and flexible than WMD in many situations. As an example, WME with a simple linear classifier reduces the computational cost of WMD-based KNN \\emph{from cubic to linear} in document length and \\emph{from quadratic to linear} in number of samples, while simultaneously improving accuracy. 
In experiments on 9 benchmark text classification datasets and 22 textual similarity tasks the proposed technique consistently matches or outperforms state-of-the-art techniques, with significantly higher accuracy on problems of short length.", "keywords": "Word2Vec;Word Mover's Distance;Document Embedding", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper418/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{ \nanonymous2018word, \ntitle={Word Mover's Embedding: From Word2Vec to Document Embedding}, \nauthor={Anonymous}, \njournal={International Conference on Learning Representations}, \nyear={2018} \n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HJoYD6gC-", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16054486152486360838&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJqUtdOaZ", "title": "ENRICHMENT OF FEATURES FOR CLASSIFICATION USING AN OPTIMIZED LINEAR/NON-LINEAR COMBINATION OF INPUT FEATURES", "track": "main", "status": "Reject", "tldr": "A method for enriching and combining features to improve classification accuracy", "abstract": "Automatic classification of objects is one of the most important tasks in engineering\nand data mining applications. Although using more complex and advanced\nclassifiers can help to improve the accuracy of classification systems, it can be\ndone by analyzing data sets and their features for a particular problem. Feature\ncombination is the one which can improve the quality of the features. In this paper,\na structure similar to Feed-Forward Neural Network (FFNN) is used to generate an\noptimized linear or non-linear combination of features for classification. Genetic\nAlgorithm (GA) is applied to update weights and biases. Since nature of data sets\nand their features impact on the effectiveness of combination and classification\nsystem, linear and non-linear activation functions (or transfer function) are used\nto achieve more reliable system. Experiments of several UCI data sets and using\nminimum distance classifier as a simple classifier indicate that proposed linear and\nnon-linear intelligent FFNN-based feature combination can present more reliable\nand promising results. 
By using such a feature combination method, there is no\nneed to use more powerful and complex classifier anymore.", "keywords": "Classification;Feature Combination;Feature Mapping;Feed-Forward Neural Network;Genetic Algorithm;Linear Transfer Function;Non-Linear Transfer Function", "primary_area": "", "supplementary_material": "", "author": "Mehran Taghipour-Gorjikolaie;Seyyed Mohammad Razavi;Javad Sadri", "authorids": "mehran.tg.88@gmail.com;razavism@gmail.com;j_sadri@encs.concordia.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntaghipour-gorjikolaie2018enrichment,\ntitle={{ENRICHMENT} {OF} {FEATURES} {FOR} {CLASSIFICATION} {USING} {AN} {OPTIMIZED} {LINEAR}/{NON}-{LINEAR} {COMBINATION} {OF} {INPUT} {FEATURES}},\nauthor={Mehran Taghipour-Gorjikolaie and Seyyed Mohammad Razavi and Javad Sadri},\nyear={2018},\nurl={https://openreview.net/forum?id=HJqUtdOaZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJqUtdOaZ", "pdf_size": 0, "rating": "1;2;3", "confidence": "5;3;4", "rating_avg": 2.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-w_VnvqlBwIJ:scholar.google.com/&scioq=ENRICHMENT+OF+FEATURES+FOR+CLASSIFICATION+USING+AN+OPTIMIZED+LINEAR/NON-LINEAR+COMBINATION+OF+INPUT+FEATURES&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJr4QJ26W", "title": "Improving image generative models with human interactions", "track": "main", "status": "Reject", "tldr": "We describe how to improve an image generative model according to a slow- or difficult-to-evaluate objective, such as human feedback, which could have many applications, like making more aesthetic images.", "abstract": "GANs provide a framework for training generative models which mimic a data distribution. However, in many cases we wish to train a generative model to optimize some auxiliary objective function within the data it generates, such as making more aesthetically pleasing images. In some cases, these objective functions are difficult to evaluate, e.g. they may require human interaction. Here, we develop a system for efficiently training a GAN to increase a generic rate of positive user interactions, for example aesthetic ratings. To do this, we build a model of human behavior in the targeted domain from a relatively small set of interactions, and then use this behavioral model as an auxiliary loss function to improve the generative model. 
As a proof of concept, we demonstrate that this system is successful at improving positive interaction rates simulated from a variety of objectives, and characterize s", "keywords": "human in the loop;GANs;generative adversarial networks;image generative models;computer vision", "primary_area": "", "supplementary_material": "", "author": "Andrew Kyle Lampinen;David So;Douglas Eck;Fred Bertsch", "authorids": "lampinen@stanford.edu;davidso@google.com;deck@google.com;fredbertsch@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkyle2018improving,\ntitle={Improving image generative models with human interactions},\nauthor={Andrew Kyle Lampinen and David So and Douglas Eck and Fred Bertsch},\nyear={2018},\nurl={https://openreview.net/forum?id=HJr4QJ26W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJr4QJ26W", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10215988006565323308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJrJpzZRZ", "title": "Self-Supervised Learning of Object Motion Through Adversarial Video Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Can we build models that automatically learn about object motion from raw, unlabeled videos? In this paper, we study the problem of multi-step video prediction, where the goal is to predict a sequence of future frames conditioned on a short context. We focus specifically on two aspects of video prediction: accurately modeling object motion, and producing naturalistic image predictions. Our model is based on a flow-based generator network with a discriminator used to improve prediction quality. The implicit flow in the generator can be examined to determine its accuracy, and the predicted images can be evaluated for image quality. We argue that these two metrics are critical for understanding whether the model has effectively learned object motion, and propose a novel evaluation benchmark based on ground truth object flow. Our network achieves state-of-the-art results in terms of both the realism of the predicted images, as determined by human judges, and the accuracy of the predicted flow. Videos and full results can be viewed on the supplementary website: \\url{https://sites.google.com/site/omvideoprediction}.", "keywords": "adversarial;video prediction;flow", "primary_area": "", "supplementary_material": "", "author": "Alex X. Lee;Frederik Ebert;Richard Zhang;Chelsea Finn;Pieter Abbeel;Sergey Levine", "authorids": ";;;;;", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nx.2018selfsupervised,\ntitle={Self-Supervised Learning of Object Motion Through Adversarial Video Prediction},\nauthor={Alex X. 
Lee and Frederik Ebert and Richard Zhang and Chelsea Finn and Pieter Abbeel and Sergey Levine},\nyear={2018},\nurl={https://openreview.net/forum?id=HJrJpzZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HJrJpzZRZ", "pdf_size": 0, "rating": "3;3;3;7", "confidence": "5;4;5;5", "rating_avg": 4.0, "confidence_avg": 4.75, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-zzjR8t6Q3QJ:scholar.google.com/&scioq=Self-Supervised+Learning+of+Object+Motion+Through+Adversarial+Video+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "i-RevNet: Deep Invertible Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/103", "id": "HJsjkMb0Z", "author_site": "Joern-Henrik Jacobsen, Arnold W Smeulders, Edouard Oyallon", "tldr": "", "abstract": "It is widely believed that the success of deep convolutional networks is based on progressively discarding uninformative variability about the input with respect to the problem at hand. This is supported empirically by the difficulty of recovering images from their hidden representations, in most commonly used network architectures. In this paper we show via a one-to-one mapping that this loss of information is not a necessary condition to learn representations that generalize well on complicated problems, such as ImageNet. Via a cascade of homeomorphic layers, we build the $i$-RevNet, a network that can be fully inverted up to the final projection onto the classes, i.e. no information is discarded. Building an invertible architecture is difficult, for one, because the local inversion is ill-conditioned, we overcome this by providing an explicit inverse. \nAn analysis of i-RevNet\u2019s learned representations suggests an alternative explanation for the success of deep networks by a progressive contraction and linear separation with depth. To shed light on the nature of the model learned by the $i$-RevNet we reconstruct linear interpolations between natural image representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "J\u00f6rn-Henrik Jacobsen;Arnold W.M. Smeulders;Edouard Oyallon", "authorids": "joern.jacobsen@bethgelab.org;a.w.m.smeulders@uva.nl;edouard.oyallon@ens.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njacobsen2018irevnet,\ntitle={i-RevNet: Deep Invertible Networks},\nauthor={J\u00f6rn-Henrik Jacobsen and Arnold W.M. 
Smeulders and Edouard Oyallon},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJsjkMb0Z},\n}", "github": "[![github](/images/github_icon.svg) jhjacobsen/pytorch-i-revnet](https://github.com/jhjacobsen/pytorch-i-revnet) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HJsjkMb0Z)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "8;8;9", "confidence": "4;4;4", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 426, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14608880224467079528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=HJsjkMb0Z", "pdf": "https://openreview.net/pdf?id=HJsjkMb0Z", "email": ";;", "author_num": 3 }, { "id": "HJsk5-Z0W", "title": "Structured Deep Factorization Machine: Towards General-Purpose Architectures", "track": "main", "status": "Reject", "tldr": "Scalable general-purpose factorization algorithm-- also helps to circumvent cold start problem.", "abstract": "In spite of their great success, traditional factorization algorithms typically do not support features (e.g., Matrix Factorization), or their complexity scales quadratically with the number of features (e.g, Factorization Machine). On the other hand, neural methods allow large feature sets, but are often designed for a specific application. We propose novel deep factorization methods that allow efficient and flexible feature representation. For example, we enable describing items with natural language with complexity linear to the vocabulary size\u2014this enables prediction for unseen items and avoids the cold start problem. We show that our architecture can generalize some previously published single-purpose neural architectures. Our experiments suggest improved training times and accuracy compared to shallow methods.", "keywords": "factorization;general-purpose methods", "primary_area": "", "supplementary_material": "", "author": "Jos\u00e9 P. Gonz\u00e1lez-Brenes;Ralph Edezhath", "authorids": "jgonzalez@chegg.com;redezhath@chegg.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\np.2018structured,\ntitle={Structured Deep Factorization Machine: Towards General-Purpose Architectures},\nauthor={Jos\u00e9 P. 
Gonz\u00e1lez-Brenes and Ralph Edezhath},\nyear={2018},\nurl={https://openreview.net/forum?id=HJsk5-Z0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJsk5-Z0W", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O-_3krpGWwcJ:scholar.google.com/&scioq=Structured+Deep+Factorization+Machine:+Towards+General-Purpose+Architectures&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/323", "id": "HJtEm4p6Z", "author_site": "Wei Ping, Kainan Peng, Andrew Gibiansky, Sercan Arik, Ajay Kannan, SHARAN NARANG, Jonathan Raiman, John Miller", "tldr": "", "abstract": "We present Deep Voice 3, a fully-convolutional attention-based neural text-to-speech (TTS) system. Deep Voice 3 matches state-of-the-art neural speech synthesis systems in naturalness while training an order of magnitude faster. We scale Deep Voice 3 to dataset sizes unprecedented for TTS, training on more than eight hundred hours of audio from over two thousand speakers. In addition, we identify common error modes of attention-based speech synthesis networks, demonstrate how to mitigate them, and compare several different waveform synthesis methods. We also describe how to scale inference to ten million queries per day on a single GPU server.", "keywords": "2000-Speaker Neural TTS;Monotonic Attention;Speech Synthesis", "primary_area": "", "supplementary_material": "", "author": "Wei Ping;Kainan Peng;Andrew Gibiansky;Sercan O. Arik;Ajay Kannan;Sharan Narang;Jonathan Raiman;John Miller", "authorids": "pingwei01@baidu.com;pengkainan@baidu.com;gibianskyandrew@baidu.com;sercanarik@baidu.com;kannanajay@baidu.com;sharan@baidu.com;raiman@openai.com;miller_john@berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nping2018deep,\ntitle={Deep Voice 3: 2000-Speaker Neural Text-to-Speech},\nauthor={Wei Ping and Kainan Peng and Andrew Gibiansky and Sercan O. 
Arik and Ajay Kannan and Sharan Narang and Jonathan Raiman and John Miller},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJtEm4p6Z},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=HJtEm4p6Z)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 586, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1828409622662260131&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJtEm4p6Z", "pdf": "https://openreview.net/pdf?id=HJtEm4p6Z", "email": ";;;;;;;", "author_num": 8 }, { "id": "HJtPtdqQG", "title": "Dynamically Learning the Learning Rates: Online Hyperparameter Optimization", "track": "main", "status": "Withdraw", "tldr": "Bayesian optimization based online hyperparameter optimization.", "abstract": "Hyperparameter tuning is arguably the most important ingredient for obtaining state of art performance in deep networks. We focus on hyperparameters that are related to the optimization algorithm, e.g. learning rates, which have a large impact on the training speed and the resulting accuracy. Typically, fixed learning rate schedules are employed during training. We propose Hyperdyn a dynamic hyperparameter optimization method that selects new learning rates on the fly at the end of each epoch. Our explore-exploit framework combines Bayesian optimization (BO) with a rejection strategy, based on a simple probabilistic wait and watch test. 
We obtain state of art accuracy results on CIFAR and Imagenet datasets, but with significantly faster training, when compared with the best manually tuned networks.", "keywords": "hyperparameters;optimization;SGD;Adam;Bayesian", "primary_area": "", "supplementary_material": "", "author": "Tuhin Sarkar;Anima Anandkumar;Leo Dirac", "authorids": "tsarkar@mit.edu;animakumar@gmail.com;leodirac@amazon.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJtPtdqQG", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:y5461WJRXhEJ:scholar.google.com/&scioq=Dynamically+Learning+the+Learning+Rates:+Online+Hyperparameter+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "An image representation based convolutional network for DNA classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/229", "id": "HJvvRoe0W", "author_site": "Bojian Yin, Marleen Balvert, Davide Zambrano, Alexander Schoenhuth, Sander Bohte", "tldr": "A method to transform DNA sequences into 2D images using space-filling Hilbert Curves to enhance the strengths of CNNs", "abstract": "The folding structure of the DNA molecule combined with helper molecules, also referred to as the chromatin, is highly relevant for the functional properties of DNA. The chromatin structure is largely determined by the underlying primary DNA sequence, though the interaction is not yet fully understood. In this paper we develop a convolutional neural network that takes an image-representation of primary DNA sequence as its input, and predicts key determinants of chromatin structure. The method is developed such that it is capable of detecting interactions between distal elements in the DNA sequence, which are known to be highly relevant. 
Our experiments show that the method outperforms several existing methods both in terms of prediction accuracy and training time.", "keywords": "DNA sequences;Hilbert curves;Convolutional neural networks;chromatin structure", "primary_area": "", "supplementary_material": "", "author": "Bojian Yin;Marleen Balvert;Davide Zambrano;Alexander Schoenhuth;Sander Bohte", "authorids": "yinbojian93@gmail.com;m.balvert@cwi.nl;d.zambrano@cwi.nl;a.schoenhuth@cwi.nl;s.m.bohte@cwi.nl", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyin2018an,\ntitle={An image representation based convolutional network for {DNA} classification},\nauthor={Bojian Yin and Marleen Balvert and Davide Zambrano and Alexander Schoenhuth and Sander Bohte},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJvvRoe0W},\n}", "github": "[![github](/images/github_icon.svg) Bojian/Hilbert-CNN](https://github.com/Bojian/Hilbert-CNN)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;3;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4721638019752473074&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HJvvRoe0W", "pdf": "https://openreview.net/pdf?id=HJvvRoe0W", "email": ";;;;", "author_num": 5 }, { "id": "HJw8fAgA-", "title": "Learning Dynamic State Abstractions for Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "A key challenge in model-based reinforcement learning (RL) is to synthesize computationally efficient and accurate environment models. We show that carefully designed models that learn predictive and compact state representations, also called state-space models, substantially reduce the computational costs for predicting outcomes of sequences of actions. Extensive experiments establish that state-space models accurately capture the dynamics of Atari games from the Arcade Learning Environment (ALE) from raw pixels. Furthermore, RL agents that use Monte-Carlo rollouts of these models as features for decision making outperform strong model-free baselines on the game MS_PACMAN, demonstrating the benefits of planning using learned dynamic state abstractions.", "keywords": "generative models;probabilistic modelling;reinforcement learning;state-space models;planning", "primary_area": "", "supplementary_material": "", "author": "Lars Buesing;Theophane Weber;Sebastien Racaniere;S. M. Ali Eslami;Danilo Rezende;David Reichert;Fabio Viola;Frederic Besse;Karol Gregor;Demis Hassabis;Daan Wierstra", "authorids": "lbuesing@google.com;theophane@google.com;;;;;;;;demishassabis@google.com;", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\nbuesing2018learning,\ntitle={Learning Dynamic State Abstractions for Model-Based Reinforcement Learning},\nauthor={Lars Buesing and Theophane Weber and Sebastien Racaniere and S. M. 
Ali Eslami and Danilo Rezende and David Reichert and Fabio Viola and Frederic Besse and Karol Gregor and Demis Hassabis and Daan Wierstra},\nyear={2018},\nurl={https://openreview.net/forum?id=HJw8fAgA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJw8fAgA-", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 11, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7IuXO02bk5oJ:scholar.google.com/&scioq=Learning+Dynamic+State+Abstractions+for+Model-Based+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Efficient Sparse-Winograd Convolutional Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/297", "id": "HJzgZ3JCW", "author_site": "Xingyu Liu, Jeff Pool, song han, Bill Dally", "tldr": "Prune and ReLU in Winograd domain for efficient convolutional neural network", "abstract": "Convolutional Neural Networks (CNNs) are computationally intensive, which limits their application on mobile devices. Their energy is dominated by the number of multiplies needed to perform the convolutions. Winograd\u2019s minimal filtering algorithm (Lavin, 2015) and network pruning (Han et al., 2015) can reduce the operation count, but these two methods cannot be straightforwardly combined \u2014 applying the Winograd transform fills in the sparsity in both the weights and the activations. We propose two modifications to Winograd-based CNNs to enable these methods to exploit sparsity. First, we move the ReLU operation into the Winograd domain to increase the sparsity of the transformed activations. Second, we prune the weights in the Winograd domain to exploit static weight sparsity. For models on CIFAR-10, CIFAR-100 and ImageNet datasets, our method reduces the number of multiplications by 10.4x, 6.8x and 10.8x respectively with loss of accuracy less than 0.1%, outperforming previous baselines by 2.0x-3.0x. We also show that moving ReLU to the Winograd domain allows more aggressive pruning.", "keywords": "deep learning;convolutional neural network;pruning", "primary_area": "", "supplementary_material": "", "author": "Xingyu Liu;Jeff Pool;Song Han;William J. Dally", "authorids": "xyl@stanford.edu;jpool@nvidia.com;songhan@stanford.edu;dally@stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2018efficient,\ntitle={Efficient Sparse-Winograd Convolutional Neural Networks},\nauthor={Xingyu Liu and Jeff Pool and Song Han and William J. 
Dally},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HJzgZ3JCW},\n}", "github": "[![github](/images/github_icon.svg) xingyul/Sparse-Winograd-CNN](https://github.com/xingyul/Sparse-Winograd-CNN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5437414522331578688&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJzgZ3JCW", "pdf": "https://openreview.net/pdf?id=HJzgZ3JCW", "email": ";;;", "author_num": 4 }, { "id": "Hk-FlMbAZ", "title": "The Manifold Assumption and Defenses Against Adversarial Perturbations", "track": "main", "status": "Reject", "tldr": "Defending against adversarial perturbations of neural networks from manifold assumption ", "abstract": "In the adversarial-perturbation problem of neural networks, an adversary starts with a neural network model $F$ and a point $\\bfx$ that $F$ classifies correctly, and applies a \\emph{small perturbation} to $\\bfx$ to produce another point $\\bfx'$ that $F$ classifies \\emph{incorrectly}. In this paper, we propose taking into account \\emph{the inherent confidence information} produced by models when studying adversarial perturbations, where a natural measure of ``confidence'' is \\|F(\\bfx)\\|_\\infty$ (i.e. how confident $F$ is about its prediction?). Motivated by a thought experiment based on the manifold assumption, we propose a ``goodness property'' of models which states that \\emph{confident regions of a good model should be well separated}. We give formalizations of this property and examine existing robust training objectives in view of them. Interestingly, we find that a recent objective by Madry et al. encourages training a model that satisfies well our formal version of the goodness property, but has a weak control of points that are wrong but with low confidence. However, if Madry et al.'s model is indeed a good solution to their objective, then good and bad points are now distinguishable and we can try to embed uncertain points back to the closest confident region to get (hopefully) correct predictions. We thus propose embedding objectives and algorithms, and perform an empirical study using this method. 
Our experimental results are encouraging: Madry et al.'s model wrapped with our embedding procedure achieves almost perfect success rate in defending against attacks that the base model fails on, while retaining good generalization behavior.\n", "keywords": "the manifold assumption;adversarial perturbation;neural networks", "primary_area": "", "supplementary_material": "", "author": "Xi Wu;Uyeong Jang;Lingjiao Chen;Somesh Jha", "authorids": "xiwu@cs.wisc.edu;wjang@cs.wisc.edu;lchen@cs.wisc.edu;jha@cs.wisc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2018the,\ntitle={The Manifold Assumption and Defenses Against Adversarial Perturbations},\nauthor={Xi Wu and Uyeong Jang and Lingjiao Chen and Somesh Jha},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk-FlMbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=Hk-FlMbAZ", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;3;3", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16297850499287055092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Learning Sparse Latent Representations with the Deep Copula Information Bottleneck", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/156", "id": "Hk0wHx-RW", "author_site": "Aleksander Wieczorek, Mario Wieser, Damian Murezzan, Volker Roth", "tldr": "We apply the copula transformation to the Deep Information Bottleneck which leads to restored invariance properties and a disentangled latent space with superior predictive capabilities.", "abstract": "Deep latent variable models are powerful tools for representation learning. In this paper, we adopt the deep information bottleneck model, identify its shortcomings and propose a model that circumvents them. To this end, we apply a copula transformation which, by restoring the invariance properties of the information bottleneck method, leads to disentanglement of the features in the latent space. Building on that, we show how this transformation translates to sparsity of the latent space in the new model. 
We evaluate our method on artificial and real data.", "keywords": "Information Bottleneck;Deep Information Bottleneck;Deep Variational Information Bottleneck;Variational Autoencoder;Sparsity;Disentanglement;Interpretability;Copula;Mutual Information", "primary_area": "", "supplementary_material": "", "author": "Aleksander Wieczorek*;Mario Wieser*;Damian Murezzan;Volker Roth", "authorids": "aleksander.wieczorek@unibas.ch;mario.wieser@unibas.ch;d.murezzan@unibas.ch;volker.roth@unibas.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwieser*2018learning,\ntitle={Learning Sparse Latent Representations with the Deep Copula Information Bottleneck},\nauthor={Mario Wieser* and Aleksander Wieczorek* and Damian Murezzan and Volker Roth},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk0wHx-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6;6", "confidence": "4;1;3;3", "rating_avg": 5.75, "confidence_avg": 2.75, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.6622661785325219, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3415979521364701041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Hk0wHx-RW", "pdf": "https://openreview.net/pdf?id=Hk0wHx-RW", "email": ";;;", "author_num": 4 }, { "id": "Hk2MHt-3-", "title": "Coupled Ensembles of Neural Networks", "track": "main", "status": "Reject", "tldr": "We show that splitting a neural network into parallel branches improves performance and that proper coupling of the branches improves performance even further.", "abstract": "We investigate in this paper the architecture of deep convolutional networks. Building on existing state of the art models, we propose a reconfiguration of the model parameters into several parallel branches at the global network level, with each branch being a standalone CNN. We show that this arrangement is an efficient way to significantly reduce the number of parameters while at the same time improving the performance. The use of branches brings an additional form of regularization. In addition to splitting the parameters into parallel branches, we propose a tighter coupling of these branches by averaging their log-probabilities. The tighter coupling favours the learning of better representations, even at the level of the individual branches, as compared to when each branch is trained independently. We refer to this branched architecture as \"coupled ensembles\". The approach is very generic and can be applied with almost any neural network architecture. With coupled ensembles of DenseNet-BC and parameter budget of 25M, we obtain error rates of 2.92%, 15.68% and 1.50% respectively on CIFAR-10, CIFAR-100 and SVHN tasks. For the same parameter budget, DenseNet-BC has an error rate of 3.46%, 17.18%, and 1.8% respectively. 
With ensembles of coupled ensembles, of DenseNet-BC networks, with 50M total parameters, we obtain error rates of 2.72%, 15.13% and 1.42% respectively on these tasks.", "keywords": "Ensemble learning;neural networks", "primary_area": "", "supplementary_material": "", "author": "Anuvabh Dutt;Denis Pellerin;Georges Qu\u00e9not", "authorids": "anuvabh.dutt@univ-grenoble-alpes.fr;denis.pellerin@gipsa-lab.grenoble-inp.fr;georges.quenot@imag.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndutt2018coupled,\ntitle={Coupled Ensembles of Neural Networks},\nauthor={Anuvabh Dutt and Denis Pellerin and Georges Qu\u00e9not},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk2MHt-3-},\n}", "github": "[![github](/images/github_icon.svg) vabh/coupled_ensembles](https://github.com/vabh/coupled_ensembles) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Hk2MHt-3-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hk2MHt-3-", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2460187434682263615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "Multi-Scale Dense Networks for Resource Efficient Image Classification", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/278", "id": "Hk2aImxAb", "author_site": "Gao Huang, Danlu Chen, Tianhong Li, Felix Wu, Laurens van der Maaten, Kilian Weinberger", "tldr": "", "abstract": "In this paper we investigate image classification with computational resource limits at test time. Two such settings are: 1. anytime classification, where the network\u2019s prediction for a test example is progressively updated, facilitating the output of a prediction at any time; and 2. budgeted batch classification, where a fixed amount of computation is available to classify a set of examples that can be spent unevenly across \u201ceasier\u201d and \u201charder\u201d inputs. In contrast to most prior work, such as the popular Viola and Jones algorithm, our approach is based on convolutional neural networks. We train multiple classifiers with varying resource demands, which we adaptively apply during test time. To maximally re-use computation between the classifiers, we incorporate them as early-exits into a single deep convolutional neural network and inter-connect them with dense connectivity. To facilitate high quality classification early on, we use a two-dimensional multi-scale network architecture that maintains coarse and fine level features all-throughout the network. 
Experiments on three image-classification tasks demonstrate that our framework substantially improves the existing state-of-the-art in both settings.", "keywords": "efficient learning;budgeted learning;deep learning;image classification;convolutional networks", "primary_area": "", "supplementary_material": "", "author": "Gao Huang;Danlu Chen;Tianhong Li;Felix Wu;Laurens van der Maaten;Kilian Weinberger", "authorids": "gh349@cornell.edu;taineleau@gmail.com;lth14@mails.tsinghua.edu.cn;fw245@cornell.edu;lvdmaaten@fb.com;kqw4@cornell.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nhuang2018multiscale,\ntitle={Multi-Scale Dense Networks for Resource Efficient Image Classification},\nauthor={Gao Huang and Danlu Chen and Tianhong Li and Felix Wu and Laurens van der Maaten and Kilian Weinberger},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk2aImxAb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=Hk2aImxAb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;10", "confidence": "4;4;4", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 941, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8749554166283747056&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Hk2aImxAb", "pdf": "https://openreview.net/pdf?id=Hk2aImxAb", "email": ";;;;;", "author_num": 6 }, { "title": "Imitation Learning from Visual Data with Multiple Intentions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/53", "id": "Hk3ddfWRW", "author_site": "Aviv Tamar, Khashayar Rohanimanesh, Yinlam Chow, Chris Vigorito, Ben Goodrich, Michael Kahane, Derik Pridmore", "tldr": "multi-modal imitation learning from unstructured demonstrations using stochastic neural network modeling intention. ", "abstract": "Recent advances in learning from demonstrations (LfD) with deep neural networks have enabled learning complex robot skills that involve high dimensional perception such as raw image inputs. \nLfD algorithms generally assume learning from single task demonstrations. In practice, however, it is more efficient for a teacher to demonstrate a multitude of tasks without careful task set up, labeling, and engineering. Unfortunately in such cases, traditional imitation learning techniques fail to represent the multi-modal nature of the data, and often result in sub-optimal behavior. In this paper we present an LfD approach for learning multiple modes of behavior from visual data. Our approach is based on a stochastic deep neural network (SNN), which represents the underlying intention in the demonstration as a stochastic activation in the network. We present an efficient algorithm for training SNNs, and for learning with vision inputs, we also propose an architecture that associates the intention with a stochastic attention module.\nWe demonstrate our method on real robot visual object reaching tasks, and show that\nit can reliably learn the multiple behavior modes in the demonstration data. 
Video results are available at https://vimeo.com/240212286/fd401241b9.", "keywords": "multi-modal imitation learning;deep learning;generative models;stochastic neural networks", "primary_area": "", "supplementary_material": "", "author": "Aviv Tamar;Khashayar Rohanimanesh;Yinlam Chow;Chris Vigorito;Ben Goodrich;Michael Kahane;Derik Pridmore", "authorids": "avivt@berkeley.edu;khash@osaro.com;yldick.chow@gmail.com;chris@osaro.com;ben@osaro.com;mk@osaro.com;derik@osaro.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\ntamar2018imitation,\ntitle={Imitation Learning from Visual Data with Multiple Intentions},\nauthor={Aviv Tamar and Khashayar Rohanimanesh and Yinlam Chow and Chris Vigorito and Ben Goodrich and Michael Kahane and Derik Pridmore},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk3ddfWRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1679668584294504646&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Hk3ddfWRW", "pdf": "https://openreview.net/pdf?id=Hk3ddfWRW", "email": ";;;;;;", "author_num": 7 }, { "title": "Smooth Loss Functions for Deep Top-k Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/170", "id": "Hk5elxbRW", "author_site": "Leonard Berrada, Andrew Zisserman, M. Pawan Kumar", "tldr": "Smooth Loss Function for Top-k Error Minimization", "abstract": "The top-$k$ error is a common measure of performance in machine learning and computer vision. In practice, top-$k$ classification is typically performed with deep neural networks trained with the cross-entropy loss. Theoretical results indeed suggest that cross-entropy is an optimal learning objective for such a task in the limit of infinite data. In the context of limited and noisy data however, the use of a loss function that is specifically designed for top-$k$ classification can bring significant improvements.\nOur empirical evidence suggests that the loss function must be smooth and have non-sparse gradients in order to work well with deep neural networks. Consequently, we introduce a family of smoothed loss functions that are suited to top-$k$ optimization via deep learning. The widely used cross-entropy is a special case of our family. Evaluating our smooth loss functions is computationally challenging: a na{\\\"i}ve algorithm would require $\\mathcal{O}(\\binom{n}{k})$ operations, where $n$ is the number of classes. Thanks to a connection to polynomial algebra and a divide-and-conquer approach, we provide an algorithm with a time complexity of $\\mathcal{O}(k n)$. Furthermore, we present a novel approximation to obtain fast and stable algorithms on GPUs with single floating point precision. We compare the performance of the cross-entropy loss and our margin-based losses in various regimes of noise and data size, for the predominant use case of $k=5$. 
Our investigation reveals that our loss is more robust to noise and overfitting than cross-entropy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Leonard Berrada;Andrew Zisserman;M. Pawan Kumar", "authorids": "lberrada@robots.ox.ac.uk;az@robots.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nberrada2018smooth,\ntitle={Smooth Loss Functions for Deep Top-k Classification},\nauthor={Leonard Berrada and Andrew Zisserman and M. Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk5elxbRW},\n}", "github": "[![github](/images/github_icon.svg) oval-group/smooth-topk](https://github.com/oval-group/smooth-topk)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2261810241418874442&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "openreview": "https://openreview.net/forum?id=Hk5elxbRW", "pdf": "https://openreview.net/pdf?id=Hk5elxbRW", "email": ";;", "author_num": 3 }, { "title": "Emergent Communication through Negotiation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/210", "id": "Hk6WhagRW", "author_site": "Kris Cao, Angeliki Lazaridou, Marc Lanctot, Joel Z Leibo, Karl Tuyls, Stephen Clark", "tldr": "We teach agents to negotiate using only reinforcement learning; selfish agents can do so, but only using a trustworthy communication channel, and prosocial agents can negotiate using cheap talk.", "abstract": "Multi-agent reinforcement learning offers a way to study how communication could emerge in communities of agents needing to solve specific problems. In this paper, we study the emergence of communication in the negotiation environment, a semi-cooperative model of agent interaction. We introduce two communication protocols - one grounded in the semantics of the game, and one which is a priori ungrounded. We show that self-interested agents can use the pre-grounded communication channel to negotiate fairly, but are unable to effectively use the ungrounded, cheap talk channel to do the same. However, prosocial agents do learn to use cheap talk to find an optimal negotiating strategy, suggesting that cooperation is necessary for language to emerge. 
We also study communication behaviour in a setting where one agent interacts with agents in a community with different levels of prosociality and show how agent identifiability can aid negotiation.", "keywords": "multi-agent learning;reinforcement learning;game theory;emergent communication", "primary_area": "", "supplementary_material": "", "author": "Kris Cao;Angeliki Lazaridou;Marc Lanctot;Joel Z Leibo;Karl Tuyls;Stephen Clark", "authorids": "kc391@cam.ac.uk;angeliki@google.com;lanctot@google.com;jzl@google.com;karltuyls@google.com;clarkstephen@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ncao2018emergent,\ntitle={Emergent Communication through Negotiation},\nauthor={Kris Cao and Angeliki Lazaridou and Marc Lanctot and Joel Z Leibo and Karl Tuyls and Stephen Clark},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk6WhagRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 211, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8825869866742501521&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Hk6WhagRW", "pdf": "https://openreview.net/pdf?id=Hk6WhagRW", "email": ";;;;;", "author_num": 6 }, { "title": "Certifying Some Distributional Robustness with Principled Adversarial Training", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/147", "id": "Hk6kPgZA-", "author_site": "Aman Sinha, Hong Namkoong, John Duchi", "tldr": "We provide a fast, principled adversarial training procedure with computational and statistical performance guarantees.", "abstract": "Neural networks are vulnerable to adversarial examples and researchers have proposed many heuristic attack and defense mechanisms. We address this problem through the principled lens of distributionally robust optimization, which guarantees performance under adversarial input perturbations. By considering a Lagrangian penalty formulation of perturbing the underlying data distribution in a Wasserstein ball, we provide a training procedure that augments model parameter updates with worst-case perturbations of training data. For smooth losses, our procedure provably achieves moderate levels of robustness with little computational or statistical cost relative to empirical risk minimization. Furthermore, our statistical guarantees allow us to efficiently certify robustness for the population loss. 
For imperceptible perturbations, our method matches or outperforms heuristic approaches.\n", "keywords": "adversarial training;distributionally robust optimization;deep learning;optimization;learning theory", "primary_area": "", "supplementary_material": "", "author": "Aman Sinha;Hongseok Namkoong;John Duchi", "authorids": "amans@stanford.edu;hnamk@stanford.edu;jduchi@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsinha2018certifiable,\ntitle={Certifiable Distributional Robustness with Principled Adversarial Training},\nauthor={Aman Sinha and Hongseok Namkoong and John Duchi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk6kPgZA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "9;9;9", "confidence": "4;4;5", "rating_avg": 9.0, "confidence_avg": 4.333333333333333, "replies_avg": 21, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5504610656672947417&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=Hk6kPgZA-", "pdf": "https://openreview.net/pdf?id=Hk6kPgZA-", "email": ";;", "author_num": 3 }, { "title": "Not-So-Random Features", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/283", "id": "Hk8XMWgRb", "author_site": "Brian Bullins, Cyril Zhang, Yi Zhang", "tldr": "A simple and practical algorithm for learning a margin-maximizing translation-invariant or spherically symmetric kernel from training data, using tools from Fourier analysis and regret minimization.", "abstract": "We propose a principled method for kernel learning, which relies on a Fourier-analytic characterization of translation-invariant or rotation-invariant kernels. Our method produces a sequence of feature maps, iteratively refining the SVM margin. We provide rigorous guarantees for optimality and generalization, interpreting our algorithm as online equilibrium-finding dynamics in a certain two-player min-max game. 
Evaluations on synthetic and real-world datasets demonstrate scalability and consistent improvements over related random features-based methods.", "keywords": "kernel learning;random features;online learning", "primary_area": "", "supplementary_material": "", "author": "Brian Bullins;Cyril Zhang;Yi Zhang", "authorids": "bbullins@cs.princeton.edu;cyril.zhang@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbullins2018notsorandom,\ntitle={Not-So-Random Features},\nauthor={Brian Bullins and Cyril Zhang and Yi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk8XMWgRb},\n}", "github": "[![github](/images/github_icon.svg) yz-ignescent/Not-So-Random-Features](https://github.com/yz-ignescent/Not-So-Random-Features)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;5;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16622124799980351573&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hk8XMWgRb", "pdf": "https://openreview.net/pdf?id=Hk8XMWgRb", "email": ";;", "author_num": 3 }, { "id": "Hk91SGWR-", "title": "Investigating Human Priors for Playing Video Games", "track": "main", "status": "Workshop", "tldr": "We investigate the various kinds of prior knowledge that help human learning and find that general priors about objects play the most critical role in guiding human gameplay.", "abstract": "What makes humans so good at solving seemingly complex video games? Unlike computers, humans bring in a great deal of prior knowledge about the world, enabling efficient decision making. This paper investigates the role of human priors for solving video games. Given a sample game, we conduct a series of ablation studies to quantify the importance of various priors. We do this by modifying the video game environment to systematically mask different types of visual information that could be used by humans as priors. We find that removal of some prior knowledge causes a drastic degradation in the speed with which human players solve the game, e.g. from 2 minutes to over 20 minutes. Furthermore, our results indicate that general priors, such as the importance of objects and visual consistency, are critical for efficient game-play.", "keywords": "Prior knowledge;Reinforcement learning;Cognitive Science", "primary_area": "", "supplementary_material": "", "author": "Rachit Dubey;Pulkit Agrawal;Deepak Pathak;Thomas L. Griffiths;Alexei A. Efros", "authorids": "rach0012@berkeley.edu;pulkitag@berkeley.edu;pathak@berkeley.edu;tom_griffiths@berkeley.edu;efros@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndubey2018investigating,\ntitle={Investigating Human Priors for Playing Video Games},\nauthor={Rachit Dubey and Pulkit Agrawal and Deepak Pathak and Thomas L. Griffiths and Alexei A. 
Efros},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk91SGWR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hk91SGWR-", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 210, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2202192690517876762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Progressive Growing of GANs for Improved Quality, Stability, and Variation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/204", "id": "Hk99zCeAb", "author_site": "Tero Karras, Timo Aila, Samuli Laine, Jaakko Lehtinen", "tldr": "We train generative adversarial networks in a progressive fashion, enabling us to generate high-resolution images with high quality.", "abstract": "We describe a new training methodology for generative adversarial networks. The key idea is to grow both the generator and discriminator progressively: starting from a low resolution, we add new layers that model increasingly fine details as training progresses. This both speeds the training up and greatly stabilizes it, allowing us to produce images of unprecedented quality, e.g., CelebA images at 1024^2. We also propose a simple way to increase the variation in generated images, and achieve a record inception score of 8.80 in unsupervised CIFAR10. Additionally, we describe several implementation details that are important for discouraging unhealthy competition between the generator and discriminator. Finally, we suggest a new metric for evaluating GAN results, both in terms of image quality and variation. 
As an additional contribution, we construct a higher-quality version of the CelebA dataset.", "keywords": "generative adversarial networks;unsupervised learning;hierarchical methods", "primary_area": "", "supplementary_material": "", "author": "Tero Karras;Timo Aila;Samuli Laine;Jaakko Lehtinen", "authorids": "tkarras@nvidia.com;taila@nvidia.com;slaine@nvidia.com;jlehtinen@nvidia.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkarras2018progressive,\ntitle={Progressive Growing of {GAN}s for Improved Quality, Stability, and Variation},\nauthor={Tero Karras and Timo Aila and Samuli Laine and Jaakko Lehtinen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk99zCeAb},\n}", "github": "[![github](/images/github_icon.svg) tkarras/progressive_growing_of_gans](https://github.com/tkarras/progressive_growing_of_gans) + [![Papers with Code](/images/pwc_icon.svg) 106 community implementations](https://paperswithcode.com/paper/?openreview=Hk99zCeAb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "1;8;8", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 9833, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11486098150916361186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Hk99zCeAb", "pdf": "https://openreview.net/pdf?id=Hk99zCeAb", "email": ";;;", "author_num": 4 }, { "title": "On the Discrimination-Generalization Tradeoff in GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/248", "id": "Hk9Xc_lR-", "author_site": "Pengchuan Zhang, Qiang Liu, Dengyong Zhou, Tao Xu, Xiaodong He", "tldr": "This paper studies the discrimination and generalization properties of GANs when the discriminator set is a restricted function class like neural networks.", "abstract": "Generative adversarial training can be generally understood as minimizing certain moment matching loss defined by a set of discriminator functions, typically neural networks. The discriminator set should be large enough to be able to uniquely identify the true distribution (discriminative), and also be small enough to go beyond memorizing samples (generalizable). In this paper, we show that a discriminator set is guaranteed to be discriminative whenever its linear span is dense in the set of bounded continuous functions. This is a very mild condition satisfied even by neural networks with a single neuron. Further, we develop generalization bounds between the learned distribution and true distribution under different evaluation metrics. When evaluated with neural distance, our bounds show that generalization is guaranteed as long as the discriminator set is small enough, regardless of the size of the generator or hypothesis set. When evaluated with KL divergence, our bound provides an explanation on the counter-intuitive behaviors of testing likelihood in GAN training. 
Our analysis sheds light on understanding the practical performance of GANs.", "keywords": "generative adversarial network;discrimination;generalization", "primary_area": "", "supplementary_material": "", "author": "Pengchuan Zhang;Qiang Liu;Dengyong Zhou;Tao Xu;Xiaodong He", "authorids": "penzhan@microsoft.com;qiang.liu@dartmouth.edu;dennyzhou@google.com;tax313@lehigh.edu;xiaohe@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhang2018on,\ntitle={On the Discrimination-Generalization Tradeoff in {GAN}s},\nauthor={Pengchuan Zhang and Qiang Liu and Dengyong Zhou and Tao Xu and Xiaodong He},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk9Xc_lR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=Hk9Xc_lR-", "pdf": "https://openreview.net/pdf?id=Hk9Xc_lR-", "email": ";;;;", "author_num": 5 }, { "title": "A Deep Reinforced Model for Abstractive Summarization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/279", "id": "HkAClQgA-", "author_site": "Romain Paulus, Caiming Xiong, richard socher", "tldr": "A summarization model combining a new intra-attention and reinforcement learning method to increase summary ROUGE scores and quality for long sequences.", "abstract": "Attentional, RNN-based encoder-decoder models for abstractive summarization have achieved good performance on short input and output sequences. For longer documents and summaries however these models often include repetitive and incoherent phrases. We introduce a neural network model with a novel intra-attention that attends over the input and continuously generated output separately, and a new training method that combines standard supervised word prediction and reinforcement learning (RL). \nModels trained only with supervised learning often exhibit \"exposure bias\" - they assume ground truth is provided at each step during training.\nHowever, when standard word prediction is combined with the global sequence prediction training of RL the resulting summaries become more readable.\nWe evaluate this model on the CNN/Daily Mail and New York Times datasets. Our model obtains a 41.16 ROUGE-1 score on the CNN/Daily Mail dataset, an improvement over previous state-of-the-art models. 
Human evaluation also shows that our model produces higher quality summaries.", "keywords": "deep learning;natural language processing;reinforcement learning;text summarization;sequence generation", "primary_area": "", "supplementary_material": "", "author": "Romain Paulus;Caiming Xiong;Richard Socher", "authorids": "rpaulus@salesforce.com;cxiong@salesforce.com;richard@socher.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npaulus2018a,\ntitle={A Deep Reinforced Model for Abstractive Summarization},\nauthor={Romain Paulus and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkAClQgA-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 10 community implementations](https://paperswithcode.com/paper/?openreview=HkAClQgA-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 2047, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=439043726958667778&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkAClQgA-", "pdf": "https://openreview.net/pdf?id=HkAClQgA-", "email": ";;", "author_num": 3 }, { "id": "HkCnm-bAb", "title": "Can Deep Reinforcement Learning solve Erdos-Selfridge-Spencer Games?", "track": "main", "status": "Workshop", "tldr": "We adapt a family of combinatorial games with tunable difficulty and an optimal policy expressible as linear network, developing it as a rich environment for reinforcement learning, showing contrasts in performance with supervised learning, and analyzing multiagent learning and generalization. ", "abstract": "Deep reinforcement learning has achieved many recent successes, but our understanding of its strengths and limitations is hampered by the lack of rich environments in which we can fully characterize optimal behavior, and correspondingly diagnose individual actions against such a characterization. \n\nHere we consider a family of combinatorial games, arising from work of Erdos, Selfridge, and Spencer, and we propose their use as environments for evaluating and comparing different approaches to reinforcement learning. These games have a number of appealing features: they are challenging for current learning approaches, but they form (i) a low-dimensional, simply parametrized environment where (ii) there is a linear closed form solution for optimal behavior from any state, and (iii) the difficulty of the game can be tuned by changing environment parameters in an interpretable way. We use these Erdos-Selfridge-Spencer games not only to compare different algorithms, but also to compare approaches based on supervised and reinforcement learning, to analyze the power of multi-agent approaches in improving performance, and to evaluate generalization to environments outside the training set. 
", "keywords": "deep learning;deep reinforcement learning;combinatorial games;optimality", "primary_area": "", "supplementary_material": "", "author": "Maithra Raghu;Alex Irpan;Jacob Andreas;Robert Kleinberg;Quoc Le;Jon Kleinberg", "authorids": "maithrar@gmail.com;alexirpan@google.com;j.d.andreas@gmail.com;rdk@cs.cornell.edu;qvl@google.com;kleinber@cs.cornell.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nraghu2018can,\ntitle={Can Deep Reinforcement Learning solve Erdos-Selfridge-Spencer Games?},\nauthor={Maithra Raghu and Alex Irpan and Jacob Andreas and Robert Kleinberg and Quoc Le and Jon Kleinberg},\nyear={2018},\nurl={https://openreview.net/forum?id=HkCnm-bAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkCnm-bAb", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5045759722516886464&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Generative Models of Visually Grounded Imagination", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/218", "id": "HkCsm6lRb", "author_site": "Shanmukha Ramakrishna Vedantam, Ian Fischer, Jonathan Huang, Kevin P Murphy", "tldr": "A VAE-variant which can create diverse images corresponding to novel concrete or abstract \"concepts\" described using attribute vectors.", "abstract": "It is easy for people to imagine what a man with pink hair looks like, even if they have never seen such a person before. We call the ability to create images of novel semantic concepts visually grounded imagination. In this paper, we show how we can modify variational auto-encoders to perform this task. Our method uses a novel training objective, and a novel product-of-experts inference network, which can handle partially specified (abstract) concepts in a principled and efficient way. We also propose a set of easy-to-compute evaluation metrics that capture our intuitive notions of what it means to have good visual imagination, namely correctness, coverage, and compositionality (the 3 C\u2019s). 
Finally, we perform a detailed comparison of our method with two existing joint image-attribute VAE methods (the JMVAE method of Suzuki et al., 2017 and the BiVCCA method of Wang et al., 2016) by applying them to two datasets: the MNIST-with-attributes dataset (which we introduce here), and the CelebA dataset (Liu et al., 2015).", "keywords": "variational autoencoders;generative models;language;vision;abstraction;compositionality;hierarchy", "primary_area": "", "supplementary_material": "", "author": "Ramakrishna Vedantam;Ian Fischer;Jonathan Huang;Kevin Murphy", "authorids": "vrama@gatech.edu;iansf@google.com;jonathanhuang@google.com;murphyk@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nvedantam2018generative,\ntitle={Generative Models of Visually Grounded Imagination},\nauthor={Ramakrishna Vedantam and Ian Fischer and Jonathan Huang and Kevin Murphy},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkCsm6lRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7259828402586264383&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HkCsm6lRb", "pdf": "https://openreview.net/pdf?id=HkCsm6lRb", "email": ";;;", "author_num": 4 }, { "id": "HkCvZXbC-", "title": "3C-GAN: AN CONDITION-CONTEXT-COMPOSITE GENERATIVE ADVERSARIAL NETWORKS FOR GENERATING IMAGES SEPARATELY", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present 3C-GAN: a novel multiple generators structures, that contains one conditional generator that generates a semantic part of an image conditional on its input label, and one context generator generates the rest of an image. Compared to original GAN model, this model has multiple generators and gives control over what its generators should generate. Unlike previous multi-generator models use a subsequent generation process, that one layer is generated given the previous layer, our model uses a process of generating different part of the images together. This way the model contains fewer parameters and the generation speed is faster. Speci\ufb01cally, the model leverages the label information to separate the object from the image correctly. Since the model conditional on the label information does not restrict to generate other parts of an image, we proposed a cost function that encourages the model to generate only the succinct part of an image in terms of label discrimination. We also found an exclusive prior on the mask of the model help separate the object. 
The experiments on MNIST, SVHN, and CelebA datasets show 3C-GAN can generate different objects with different generators simultaneously, according to the labels given to each generator.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yeu-Chern Harn;Vladimir Jojic", "authorids": "ycharn@cs.unc.edu;vjojic@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nharn2018cgan,\ntitle={3C-{GAN}: {AN} {CONDITION}-{CONTEXT}-{COMPOSITE} {GENERATIVE} {ADVERSARIAL} {NETWORKS} {FOR} {GENERATING} {IMAGES} {SEPARATELY}},\nauthor={Yeu-Chern Harn and Vladimir Jojic},\nyear={2018},\nurl={https://openreview.net/forum?id=HkCvZXbC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkCvZXbC-", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1330924967150321687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HkCy2uqQM", "title": "Complex- and Real-Valued Neural Network Architectures", "track": "main", "status": "Withdraw", "tldr": "Comparison of complex- and real-valued multi-layer perceptron with respect to the number of real-valued parameters.", "abstract": "Complex-value neural networks are not a new concept, however, the use of real-values has often been favoured over complex-values due to difficulties in training and accuracy of results. Existing literature ignores the number of parameters used. We compared complex- and real-valued neural networks using five activation functions. We found that when real and complex neural networks are compared using simple classification tasks, complex neural networks perform equal to or slightly worse than real-value neural networks. However, when specialised architecture is used, complex-valued neural networks outperform real-valued neural networks. 
Therefore, complex-valued neural networks should be used when the input data is also complex or it can be meaningfully mapped to the complex plane, or when the network architecture uses the structure defined by using complex numbers.", "keywords": "complex numbers;complex-valued;neural;network;multi-layer;perceptron;architecture", "primary_area": "", "supplementary_material": "", "author": "Nils Moenning;Suresh Manandhar", "authorids": "nm819@york.ac.uk;suresh.manandhar@york.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkCy2uqQM", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=146233169912416915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HkGJUXb0-", "title": "Learning Efficient Tensor Representations with Ring Structure Networks", "track": "main", "status": "Workshop", "tldr": "", "abstract": "\\emph{Tensor train (TT) decomposition} is a powerful representation for high-order tensors, which has been successfully applied to various machine learning tasks in recent years. In this paper, we propose a more generalized tensor decomposition with ring structure network by employing circular multilinear products over a sequence of lower-order core tensors, which is termed as TR representation. Several learning algorithms including blockwise ALS with adaptive tensor ranks and SGD with high scalability are presented. Furthermore, the mathematical properties are investigated, which enables us to perform basic algebra operations in a computationally efficient way by using TR representations. Experimental results on synthetic signals and real-world datasets demonstrate the effectiveness of TR model and the learning algorithms. In particular, we show that the structure information and high-order correlations within a 2D image can be captured efficiently by employing tensorization and TR representation. 
\n", "keywords": "Tensor Decomposition;Tensor Networks;Stochastic Gradient Descent", "primary_area": "", "supplementary_material": "", "author": "Qibin Zhao;Masashi Sugiyama;Longhao Yuan;Andrzej Cichocki", "authorids": "qibin.zhao@riken.jp;sugi@k.u-tokyo.ac.jp;longhao.yuan@riken.jp;a.cichocki@riken.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2018learning,\ntitle={Learning Efficient Tensor Representations with Ring Structure Networks},\nauthor={Qibin Zhao and Masashi Sugiyama and Longhao Yuan and Andrzej Cichocki},\nyear={2018},\nurl={https://openreview.net/forum?id=HkGJUXb0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkGJUXb0-", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6150302399397072593&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "HkGcX--0-", "title": "Auxiliary Guided Autoregressive Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative modeling of high-dimensional data is a key problem in machine learning. Successful approaches include latent variable models and autoregressive models. The complementary strengths of these approaches, to model global and local image statistics respectively, suggest hybrid models combining the strengths of both models. Our contribution is to train such hybrid models using an auxiliary loss function that controls which information is captured by the latent variables and what is left to the autoregressive decoder. In contrast, prior work on such hybrid models needed to limit the capacity of the autoregressive decoder to prevent degenerate models that ignore the latent variables and only rely on autoregressive modeling. Our approach results in models with meaningful latent variable representations, and which rely on powerful autoregressive decoders to model image details. 
Our model generates qualitatively convincing samples, and yields state-of-the-art quantitative results.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Lucas;Jakob Verbeek", "authorids": "thomas.lucas@inria.fr;jakob.verbeek@inria.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlucas2018auxiliary,\ntitle={Auxiliary Guided Autoregressive Variational Autoencoders},\nauthor={Thomas Lucas and Jakob Verbeek},\nyear={2018},\nurl={https://openreview.net/forum?id=HkGcX--0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkGcX--0-", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5285548936075503803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "HkJ1rgbCb", "title": "Using Deep Reinforcement Learning to Generate Rationales for Molecules", "track": "main", "status": "Reject", "tldr": "We use a reinforcement learning over molecular graphs to generate rationales for interpretable molecular property prediction.", "abstract": "Deep learning algorithms are increasingly used in modeling chemical processes. However, black box predictions without rationales have limited used in practical applications, such as drug design. To this end, we learn to identify molecular substructures -- rationales -- that are associated with the target chemical property (e.g., toxicity). The rationales are learned in an unsupervised fashion, requiring no additional information beyond the end-to-end task. We formulate this problem as a reinforcement learning problem over the molecular graph, parametrized by two convolution networks corresponding to the rationale selection and prediction based on it, where the latter induces the reward function. We evaluate the approach on two benchmark toxicity datasets. We demonstrate that our model sustains high performance under the additional constraint that predictions strictly follow the rationales. Additionally, we validate the extracted rationales through comparison against those described in chemical literature and through synthetic experiments. 
", "keywords": "Reinforcement Learning;Chemistry;Interpretable Models", "primary_area": "", "supplementary_material": "", "author": "Benson Chen;Connor Coley;Regina Barzilay;Tommi Jaakkola", "authorids": "bensonc@mit.edu;ccoley@mit.edu;regina@csail.mit.edu;tommi@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2018using,\ntitle={Using Deep Reinforcement Learning to Generate Rationales for Molecules},\nauthor={Benson Chen and Connor Coley and Regina Barzilay and Tommi Jaakkola},\nyear={2018},\nurl={https://openreview.net/forum?id=HkJ1rgbCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkJ1rgbCb", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4068039030609861271&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Wasserstein Auto-Encoders", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/182", "id": "HkL7n1-0b", "author_site": "Ilya Tolstikhin, Olivier Bousquet, Sylvain Gelly, Bernhard Schoelkopf", "tldr": "We propose a new auto-encoder based on the Wasserstein distance, which improves on the sampling properties of VAE.", "abstract": "We propose the Wasserstein Auto-Encoder (WAE)---a new algorithm for building a generative model of the data distribution. WAE minimizes a penalized form of the Wasserstein distance between the model distribution and the target distribution, which leads to a different regularizer than the one used by the Variational Auto-Encoder (VAE).\nThis regularizer encourages the encoded training distribution to match the prior. We compare our algorithm with several other techniques and show that it is a generalization of adversarial auto-encoders (AAE). 
Our experiments show that WAE shares many of the properties of VAEs (stable training, encoder-decoder architecture, nice latent manifold structure) while generating samples of better quality.", "keywords": "auto-encoder;generative models;GAN;VAE;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Ilya Tolstikhin;Olivier Bousquet;Sylvain Gelly;Bernhard Schoelkopf", "authorids": "iliya.tolstikhin@gmail.com;obousquet@gmail.com;sylvain.gelly@gmail.com;bs@tuebingen.mpg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ntolstikhin2018wasserstein,\ntitle={Wasserstein Auto-Encoders},\nauthor={Ilya Tolstikhin and Olivier Bousquet and Sylvain Gelly and Bernhard Schoelkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkL7n1-0b},\n}", "github": "[![github](/images/github_icon.svg) tolstikhin/wae](https://github.com/tolstikhin/wae) + [![Papers with Code](/images/pwc_icon.svg) 12 community implementations](https://paperswithcode.com/paper/?openreview=HkL7n1-0b)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "8;8;8", "confidence": "3;4;3", "rating_avg": 8.0, "confidence_avg": 3.3333333333333335, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1423, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1669877132293977025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HkL7n1-0b", "pdf": "https://openreview.net/pdf?id=HkL7n1-0b", "email": ";;;", "author_num": 4 }, { "id": "HkMCybx0-", "title": "Improving Deep Learning by Inverse Square Root Linear Units (ISRLUs)", "track": "main", "status": "Reject", "tldr": "We introduce the ISRLU activation function which is continuously differentiable and faster than ELU. The related ISRU replaces tanh & sigmoid.", "abstract": "We introduce the \u201cinverse square root linear unit\u201d (ISRLU) to speed up learning in deep neural networks. ISRLU has better performance than ELU but has many of the same benefits. ISRLU and ELU have similar curves and characteristics. Both have negative values, allowing them to push mean unit activation closer to zero, and bring the normal gradient closer to the unit natural gradient, ensuring a noise- robust deactivation state, lessening the over fitting risk. The significant performance advantage of ISRLU on traditional CPUs also carry over to more efficient HW implementations on HW/SW codesign for CNNs/RNNs. In experiments with TensorFlow, ISRLU leads to faster learning and better generalization than ReLU on CNNs. This work also suggests a computationally efficient variant called the \u201cinverse square root unit\u201d (ISRU) which can be used for RNNs. Many RNNs use either long short-term memory (LSTM) and gated recurrent units (GRU) which are implemented with tanh and sigmoid activation functions. 
ISRU has less computational complexity but still has a similar curve to tanh and sigmoid.", "keywords": "Deep learning;Theory", "primary_area": "", "supplementary_material": "", "author": "Brad Carlile;Guy Delamarter;Paul Kinney;Akiko Marti;Brian Whitney", "authorids": "bradcarlile@yahoo.com;;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ncarlile2018improving,\ntitle={Improving Deep Learning by Inverse Square Root Linear Units ({ISRLU}s)},\nauthor={Brad Carlile and Guy Delamarter and Paul Kinney and Akiko Marti and Brian Whitney},\nyear={2018},\nurl={https://openreview.net/forum?id=HkMCybx0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkMCybx0-", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14608040289778925184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HkMhoDITb", "title": "Reinforcement Learning via Replica Stacking of Quantum Measurements for the Training of Quantum Boltzmann Machines", "track": "main", "status": "Reject", "tldr": "We train Quantum Boltzmann Machines using a replica stacking method and a quantum annealer to perform a reinforcement learning task.", "abstract": "Recent theoretical and experimental results suggest the possibility of using current and near-future quantum hardware in challenging sampling tasks. In this paper, we introduce free-energy-based reinforcement learning (FERL) as an application of quantum hardware. We propose a method for processing a quantum annealer’s measured qubit spin configurations in approximating the free energy of a quantum Boltzmann machine (QBM). We then apply this method to perform reinforcement learning on the grid-world problem using the D-Wave 2000Q quantum annealer. The experimental results show that our technique is a promising method for harnessing the power of quantum sampling in reinforcement learning tasks.", "keywords": "Quantum Annealing;Reinforcement Learning;Boltzmann Machines;Markov Chain Monte Carlo", "primary_area": "", "supplementary_material": "", "author": "Anna Levit;Daniel Crawford;Navid Ghadermarzy;Jaspreet S. Oberoi;Ehsan Zahedinejad;Pooya Ronagh", "authorids": "anna.levit@1qbit.com;daniel.crawford@1qbit.com;navid.ghadermarzy@1qbit.com;jaspreet.oberoi@1qbit.com;ehsan.zahedinejad@1qbit.com;pooya.ronagh@1qbit.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlevit2018reinforcement,\ntitle={Reinforcement Learning via Replica Stacking of Quantum Measurements for the Training of Quantum Boltzmann Machines},\nauthor={Anna Levit and Daniel Crawford and Navid Ghadermarzy and Jaspreet S. 
Oberoi and Ehsan Zahedinejad and Pooya Ronagh},\nyear={2018},\nurl={https://openreview.net/forum?id=HkMhoDITb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkMhoDITb", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l_e8miQ2HCgJ:scholar.google.com/&scioq=Reinforcement+Learning+via+Replica+Stacking+of+Quantum+Measurements+for+the+Training+of+Quantum+Boltzmann+Machines&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Learning Latent Representations in Neural Networks for Clustering through Pseudo Supervision and Graph-based Activity Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/250", "id": "HkMvEOlAb", "author_site": "Ozsel Kilinc, Ismail Uysal", "tldr": "", "abstract": "In this paper, we propose a novel unsupervised clustering approach exploiting the hidden information that is indirectly introduced through a pseudo classification objective. Specifically, we randomly assign a pseudo parent-class label to each observation which is then modified by applying the domain specific transformation associated with the assigned label. Generated pseudo observation-label pairs are subsequently used to train a neural network with Auto-clustering Output Layer (ACOL) that introduces multiple softmax nodes for each pseudo parent-class. Due to the unsupervised objective based on Graph-based Activity Regularization (GAR) terms, softmax duplicates of each parent-class are specialized as the hidden information captured through the help of domain specific transformations is propagated during training. Ultimately we obtain a k-means friendly latent representation. Furthermore, we demonstrate how the chosen transformation type impacts performance and helps propagate the latent information that is useful in revealing unknown clusters. 
Our results show state-of-the-art performance for unsupervised clustering tasks on MNIST, SVHN and USPS datasets, with the highest accuracies reported to date in the literature.", "keywords": "representation learning;unsupervised clustering;pseudo supervision;graph-based activity regularization;auto-clustering output layer", "primary_area": "", "supplementary_material": "", "author": "Ozsel Kilinc;Ismail Uysal", "authorids": "ozsel@mail.usf.edu;iuysal@usf.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkilinc2018learning,\ntitle={Learning Latent Representations in Neural Networks for Clustering through Pseudo Supervision and Graph-based Activity Regularization},\nauthor={Ozsel Kilinc and Ismail Uysal},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkMvEOlAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11376218691210990570&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkMvEOlAb", "pdf": "https://openreview.net/pdf?id=HkMvEOlAb", "email": ";", "author_num": 2 }, { "title": "On the Expressive Power of Overlapping Architectures of Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/230", "id": "HkNGsseC-", "author_site": "Or Sharir, Amnon Shashua", "tldr": "We analyze how the degree of overlaps between the receptive fields of a convolutional network affects its expressive power.", "abstract": "Expressive efficiency refers to the relation between two architectures A and B, whereby any function realized by B could be replicated by A, but there exists functions realized by A, which cannot be replicated by B unless its size grows significantly larger. For example, it is known that deep networks are exponentially efficient with respect to shallow networks, in the sense that a shallow network must grow exponentially large in order to approximate the functions represented by a deep network of polynomial size. In this work, we extend the study of expressive efficiency to the attribute of network connectivity and in particular to the effect of \"overlaps\" in the convolutional process, i.e., when the stride of the convolution is smaller than its filter size (receptive field).\nTo theoretically analyze this aspect of network's design, we focus on a well-established surrogate for ConvNets called Convolutional Arithmetic Circuits (ConvACs), and then demonstrate empirically that our results hold for standard ConvNets as well. Specifically, our analysis shows that having overlapping local receptive fields, and more broadly denser connectivity, results in an exponential increase in the expressive capacity of neural networks. 
Moreover, while denser connectivity can increase the expressive capacity, we show that the most common types of modern architectures already exhibit exponential increase in expressivity, without relying on fully-connected layers.", "keywords": "Deep Learning;Expressive Efficiency;Overlapping;Receptive Fields", "primary_area": "", "supplementary_material": "", "author": "Or Sharir;Amnon Shashua", "authorids": "or.sharir@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nsharir2018on,\ntitle={On the Expressive Power of Overlapping Architectures of Deep Learning},\nauthor={Or Sharir and Amnon Shashua},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkNGsseC-},\n}", "github": "[![github](/images/github_icon.svg) HUJI-Deep/OverlapsAndExpressiveness](https://github.com/HUJI-Deep/OverlapsAndExpressiveness)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17865700268037263115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkNGsseC-", "pdf": "https://openreview.net/pdf?id=HkNGsseC-", "email": ";", "author_num": 2 }, { "id": "HkOhuyA6-", "title": "Graph Classification with 2D Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce a novel way to represent graphs as multi-channel image-like structures that allows them to be handled by vanilla 2D CNNs.", "abstract": "Graph classification is currently dominated by graph kernels, which, while powerful, suffer some significant limitations. Convolutional Neural Networks (CNNs) offer a very appealing alternative. However, processing graphs with CNNs is not trivial. To address this challenge, many sophisticated extensions of CNNs have recently been proposed. In this paper, we reverse the problem: rather than proposing yet another graph CNN model, we introduce a novel way to represent graphs as multi-channel image-like structures that allows them to be handled by vanilla 2D CNNs. Despite its simplicity, our method proves very competitive to state-of-the-art graph kernels and graph CNNs, and outperforms them by a wide margin on some datasets. It is also preferable to graph kernels in terms of time complexity. Code and data are publicly available.", "keywords": "graph classification;convolutional neural networks;2D CNN;representation", "primary_area": "", "supplementary_material": "", "author": "Antoine J.-P. Tixier;Giannis Nikolentzos;Polykarpos Meladianos;Michalis Vazirgiannis", "authorids": "antoine.tixier-1@colorado.edu;giannisnik@hotmail.com;p.meladianos@gmail.com;mvazirg@lix.polytechnique.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nj.-p.2018graph,\ntitle={Graph Classification with 2D Convolutional Neural Networks},\nauthor={Antoine J.-P. 
Tixier and Giannis Nikolentzos and Polykarpos Meladianos and Michalis Vazirgiannis},\nyear={2018},\nurl={https://openreview.net/forum?id=HkOhuyA6-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkOhuyA6-", "pdf_size": 0, "rating": "3;4;7", "confidence": "5;3;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.6933752452815365, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=598852908918437087&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HkPCrEZ0Z", "title": "Combining Model-based and Model-free RL via Multi-step Control Variates", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-free deep reinforcement learning algorithms are able to successfully solve a wide range of continuous control tasks, but typically require many on-policy samples to achieve good performance. Model-based RL algorithms are sample-efficient on the other hand, while learning accurate global models of complex dynamic environments has turned out to be tricky in practice, which leads to the unsatisfactory performance of the learned policies. In this work, we combine the sample-efficiency of model-based algorithms and the accuracy of model-free algorithms. We leverage multi-step neural network based predictive models by embedding real trajectories into imaginary rollouts of the model, and use the imaginary cumulative rewards as control variates for model-free algorithms. In this way, we achieved the strengths of both sides and derived an estimator which is not only sample-efficient, but also unbiased and of very low variance. We present our evaluation on the MuJoCo and OpenAI Gym benchmarks. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tong Che;Yuchen Lu;George Tucker;Surya Bhupatiraju;Shane Gu;Sergey Levine;Yoshua Bengio", "authorids": "gerryche@berkeley.edu;luyuchen.paul@gmail.com;gjt@google.com;sbhupatiraju@google.com;shanegu@google.com;svlevine@eecs.berkeley.edu;bengioy@iro.umontreal.ca", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nche2018combining,\ntitle={Combining Model-based and Model-free {RL} via Multi-step Control Variates},\nauthor={Tong Che and Yuchen Lu and George Tucker and Surya Bhupatiraju and Shane Gu and Sergey Levine and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=HkPCrEZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkPCrEZ0Z", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5126788791000118366&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HkSZyinVG", "title": "Improved Learning in Convolutional Neural Networks with Shifted Exponential Linear Units (ShELUs)", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The Exponential Linear Unit (ELU) has been proven to speed up learning and improve the classification performance over activation functions such as ReLU and Leaky ReLU for convolutional neural networks. The reasons behind the improved behavior are that ELU reduces the bias shift, it saturates for large negative inputs and it is continuously differentiable. However, it remains open whether ELU has the optimal shape and we address the quest for a superior activation function.\nWe use a new formulation to tune a piecewise linear activation function during training, to investigate the above question, and learn the shape of the locally optimal activation function. With this tuned activation function, the classification performance is improved and the resulting, learned activation function shows to be ELU-shaped irrespective if it is initialized as a RELU, LReLU or ELU. Interestingly, the learned activation function does not exactly pass through the origin indicating that a shifted ELU-shaped activation function is preferable. This observation leads us to introduce the Shifted Exponential Linear Unit (ShELU) as a new activation function.\nExperiments on Cifar-100 show that the classification performance is further improved when using the ShELU activation function in comparison with ELU. 
The improvement is achieved when learning an individual bias shift for each neuron.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper459/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018improved,\n title={Improved Learning in Convolutional Neural Networks with Shifted Exponential Linear Units (ShELUs)},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=rk98KCgRW}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HkSZyinVG", "pdf_size": 0, "rating": "1;3;4", "confidence": "5;5;5", "rating_avg": 2.6666666666666665, "confidence_avg": 5.0, "replies_avg": 8, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15880331091879249464&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Attacking Binarized Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/47", "id": "HkTEFfZRb", "author_site": "Angus Galloway, Graham W Taylor, Medhat Moussa", "tldr": "We conduct adversarial attacks against binarized neural networks and show that we reduce the impact of the strongest attacks, while maintaining comparable accuracy in a black-box setting", "abstract": "Neural networks with low-precision weights and activations offer compelling\nefficiency advantages over their full-precision equivalents. The two most\nfrequently discussed benefits of quantization are reduced memory consumption,\nand a faster forward pass when implemented with efficient bitwise\noperations. We propose a third benefit of very low-precision neural networks:\nimproved robustness against some adversarial attacks, and in the worst case,\nperformance that is on par with full-precision models. We focus on the very\nlow-precision case where weights and activations are both quantized to $\\pm$1,\nand note that stochastically quantizing weights in just one layer can sharply\nreduce the impact of iterative attacks. We observe that non-scaled binary neural\nnetworks exhibit a similar effect to the original \\emph{defensive distillation}\nprocedure that led to \\emph{gradient masking}, and a false notion of security.\nWe address this by conducting both black-box and white-box experiments with\nbinary models that do not artificially mask gradients.", "keywords": "adversarial examples;adversarial attacks;binary;binarized neural networks", "primary_area": "", "supplementary_material": "", "author": "Angus Galloway;Graham W. Taylor;Medhat Moussa", "authorids": "gallowaa@uoguelph.ca;gwtaylor@uoguelph.ca;mmoussa@uoguelph.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngalloway2018attacking,\ntitle={Attacking Binarized Neural Networks},\nauthor={Angus Galloway and Graham W. 
Taylor and Medhat Moussa},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkTEFfZRb},\n}", "github": "[![github](/images/github_icon.svg) AngusG/cleverhans-attacking-bnns](https://github.com/AngusG/cleverhans-attacking-bnns)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4964512256521124807&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HkTEFfZRb", "pdf": "https://openreview.net/pdf?id=HkTEFfZRb", "email": ";;", "author_num": 3 }, { "title": "SEARNN: Training RNNs with global-local losses", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/191", "id": "HkUR_y-RZ", "author_site": "R\u00e9mi Leblond, Jean-Baptiste Alayrac, Anton Osokin, Simon Lacoste-Julien", "tldr": "We introduce SeaRNN, a novel algorithm for RNN training, inspired by the learning to search approach to structured prediction, in order to avoid the limitations of MLE training.", "abstract": "We propose SEARNN, a novel training algorithm for recurrent neural networks (RNNs) inspired by the \"learning to search\" (L2S) approach to structured prediction. RNNs have been widely successful in structured prediction applications such as machine translation or parsing, and are commonly trained using maximum likelihood estimation (MLE). Unfortunately, this training loss is not always an appropriate surrogate for the test error: by only maximizing the ground truth probability, it fails to exploit the wealth of information offered by structured losses. Further, it introduces discrepancies between training and predicting (such as exposure bias) that may hurt test performance. Instead, SEARNN leverages test-alike search space exploration to introduce global-local losses that are closer to the test error. We first demonstrate improved performance over MLE on two different tasks: OCR and spelling correction. Then, we propose a subsampling strategy to enable SEARNN to scale to large vocabulary sizes. 
This allows us to validate the benefits of our approach on a machine translation task.", "keywords": "Structured prediction;RNNs", "primary_area": "", "supplementary_material": "", "author": "R\u00e9mi Leblond;Jean-Baptiste Alayrac;Anton Osokin;Simon Lacoste-Julien", "authorids": "remi.leblond@inria.fr;jean-baptiste.alayrac@inria.fr;aosokin@hse.ru;slacoste@iro.umontreal.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nleblond2018searnn,\ntitle={{SEARNN}: Training {RNN}s with global-local losses},\nauthor={R\u00e9mi Leblond and Jean-Baptiste Alayrac and Anton Osokin and Simon Lacoste-Julien},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkUR_y-RZ},\n}", "github": "[![github](/images/github_icon.svg) RemiLeblond/SeaRNN-open](https://github.com/RemiLeblond/SeaRNN-open)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": -0.6546536707079772, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=HkUR_y-RZ", "pdf": "https://openreview.net/pdf?id=HkUR_y-RZ", "email": ";;;", "author_num": 4 }, { "title": "Towards Image Understanding from Deep Compression Without Decoding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/21", "id": "HkXWCMbRW", "author_site": "R\u00f3bert Torfason, Fabian Mentzer, Eirikur Agustsson, Michael Tschannen, Radu Timofte, Luc Van Gool", "tldr": "", "abstract": "Motivated by recent work on deep neural network (DNN)-based image compression methods showing potential improvements in image quality, savings in storage, and bandwidth reduction, we propose to perform image understanding tasks such as classification and segmentation directly on the compressed representations produced by these compression methods. Since the encoders and decoders in DNN-based compression methods are neural networks with feature-maps as internal representations of the images, we directly integrate these with architectures for image understanding. This bypasses decoding of the compressed representation into RGB space and reduces computational cost. Our study shows that accuracies comparable to networks that operate on compressed RGB images can be achieved while reducing the computational complexity up to $2\\times$. Furthermore, we show that synergies are obtained by jointly training compression networks with classification networks on the compressed representations, improving image quality, classification accuracy, and segmentation performance. 
We find that inference from compressed representations is particularly advantageous compared to inference from compressed RGB images for aggressive compression rates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Robert Torfason;Fabian Mentzer;Eirikur Agustsson;Michael Tschannen;Radu Timofte;Luc Van Gool", "authorids": "robertto@student.ethz.ch;mentzerf@vision.ee.ethz.ch;aeirikur@vision.ee.ethz.ch;michaelt@nari.ee.ethz.ch;radu.timofte@vision.ee.ethz.ch;vangool@vision.ee.ethz.ch", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ntorfason2018towards,\ntitle={Towards Image Understanding from Deep Compression Without Decoding},\nauthor={R\u00f3bert Torfason and Fabian Mentzer and Eir\u00edkur \u00c1g\u00fastsson and Michael Tschannen and Radu Timofte and Luc Van Gool},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkXWCMbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;9", "confidence": "3;4;5", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10655381109239631918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HkXWCMbRW", "pdf": "https://openreview.net/pdf?id=HkXWCMbRW", "email": ";;;;;", "author_num": 6 }, { "title": "Temporally Efficient Deep Learning with Spikes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/135", "id": "HkZy-bW0-", "author_site": "Peter OConnor, Efstratios Gavves, Matthias Reisser, Max Welling", "tldr": "An algorithm for training neural networks efficiently on temporally redundant data.", "abstract": "The vast majority of natural sensory data is temporally redundant. For instance, video frames or audio samples which are sampled at nearby points in time tend to have similar values. Typically, deep learning algorithms take no advantage of this redundancy to reduce computations. This can be an obscene waste of energy. We present a variant on backpropagation for neural networks in which computation scales with the rate of change of the data - not the rate at which we process the data. We do this by implementing a form of Predictive Coding wherein neurons communicate a combination of their state, and their temporal change in state, and quantize this signal using Sigma-Delta modulation. Intriguingly, this simple communication rule gives rise to units that resemble biologically-inspired leaky integrate-and-fire neurons, and to a spike-timing-dependent weight-update similar to Spike-Timing Dependent Plasticity (STDP), a synaptic learning rule observed in the brain. We demonstrate that on MNIST, on a temporal variant of MNIST, and on Youtube-BB, a dataset with videos in the wild, our algorithm performs about as well as a standard deep network trained with backpropagation, despite only communicating discrete values between layers. 
", "keywords": "online learning;spiking networks;deep learning;temporal", "primary_area": "", "supplementary_material": "", "author": "Peter O'Connor;Efstratios Gavves;Matthias Reisser;Max Welling", "authorids": "peter.ed.oconnor@gmail.com;e.gavves@uva.nl;reisser.matthias@gmail.com;m.welling@uva.nl", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\no'connor2018temporally,\ntitle={Temporally Efficient Deep Learning with Spikes},\nauthor={Peter O'Connor and Efstratios Gavves and Matthias Reisser and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkZy-bW0-},\n}", "github": "[![github](/images/github_icon.svg) petered/pdnn](https://github.com/petered/pdnn)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10962726962539033469&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HkZy-bW0-", "pdf": "https://openreview.net/pdf?id=HkZy-bW0-", "email": ";;;", "author_num": 4 }, { "id": "Hk__kGbCW", "title": "DENSELY CONNECTED RECURRENT NEURAL NETWORK FOR SEQUENCE-TO-SEQUENCE LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks based sequence-to-sequence learning has achieved remarkable progress in applications like machine translation and text summarization. However, sequence-to-sequence models suffer from severe inefficiency in training process, requiring huge amount of training time as well as memory usage. In this work, inspired by densely connected layers in modern convolutional neural network, we introduce densely connected sequence-to-sequence learning mechanism to tackle this challenge. In this mechanism, multiple layers of representations from stacked recurrent neural networks are concatenated to enhance feature reuse. Furthermore, a densely connected attention model is elaborately leveraged to improve information flow with more efficient parameter usage via multi-branch structure and local sparsity. We show that such a densely connected mechanism significantly reduces training time and memory usage for sequence-to-sequence learning. 
In particular, in WMT-14 English-French translation task with a subset of 12M training data, it takes half of training time and model parameters to achieve similar BLEU as typical stacked LSTM models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fei Tian", "authorids": "", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ntian2018densely,\ntitle={{DENSELY} {CONNECTED} {RECURRENT} {NEURAL} {NETWORK} {FOR} {SEQUENCE}-{TO}-{SEQUENCE} {LEARNING}},\nauthor={Fei Tian},\nyear={2018},\nurl={https://openreview.net/forum?id=Hk__kGbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hk__kGbCW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jbuYg7oVKFMJ:scholar.google.com/&scioq=DENSELY+CONNECTED+RECURRENT+NEURAL+NETWORK+FOR+SEQUENCE-TO-SEQUENCE+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "HkanP0lRW", "title": "Data-driven Feature Sampling for Deep Hyperspectral Classification and Segmentation", "track": "main", "status": "Reject", "tldr": "We applied deep learning techniques to hyperspectral image segmentation and iterative feature sampling.", "abstract": "The high dimensionality of hyperspectral imaging forces unique challenges in scope, size and processing requirements. Motivated by the potential for an in-the-field cell sorting detector, we examine a Synechocystis sp. PCC 6803 dataset wherein cells are grown alternatively in nitrogen rich or deplete cultures. We use deep learning techniques to both successfully classify cells and generate a mask segmenting the cells/condition from the background. Further, we use the classification accuracy to guide a data-driven, iterative feature selection method, allowing the design neural networks requiring 90% fewer input features with little accuracy degradation.", "keywords": "Applied deep learning;Image segmentation;Hyperspectral Imaging;Feature sampling", "primary_area": "", "supplementary_material": "", "author": "William M. Severa;Jerilyn A. Timlin;Suraj Kholwadwala;Conrad D. James;James B. Aimone", "authorids": "wmsever@sandia.gov;jatimli@sandia.gov;skholwadwala@gmail.com;cdjame@sandia.gov;jbaimon@sandia.gov", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nm.2018datadriven,\ntitle={Data-driven Feature Sampling for Deep Hyperspectral Classification and Segmentation},\nauthor={William M. Severa and Jerilyn A. Timlin and Suraj Kholwadwala and Conrad D. James and James B. 
Aimone},\nyear={2018},\nurl={https://openreview.net/forum?id=HkanP0lRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkanP0lRW", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;5;5", "rating_avg": 4.333333333333333, "confidence_avg": 5.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:K4AaX6Za6w0J:scholar.google.com/&scioq=Data-driven+Feature+Sampling+for+Deep+Hyperspectral+Classification+and+Segmentation&hl=en&as_sdt=0,33", "gs_version_total": 4 }, { "id": "HkbJTYyAb", "title": "Convolutional Normalizing Flows", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian posterior inference is prevalent in various machine learning problems. Variational inference provides one way to approximate the posterior distribution, however its expressive power is limited and so is the accuracy of resulting approximation. Recently, there has a trend of using neural networks to approximate the variational posterior distribution due to the flexibility of neural network architecture. One way to construct flexible variational distribution is to warp a simple density into a complex by normalizing flows, where the resulting density can be analytically evaluated. However, there is a trade-off between the flexibility of normalizing flow and computation cost for efficient transformation. In this paper, we propose a simple yet effective architecture of normalizing flows, ConvFlow, based on convolution over the dimensions of random input vector. Experiments on synthetic and real world posterior inference problems demonstrate the effectiveness and efficiency of the proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guoqing Zheng;Yiming Yang;Jaime Carbonell", "authorids": "gzheng@cs.cmu.edu;yiming@cs.cmu.edu;jgc@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzheng2018convolutional,\ntitle={Convolutional Normalizing Flows},\nauthor={Guoqing Zheng and Yiming Yang and Jaime Carbonell},\nyear={2018},\nurl={https://openreview.net/forum?id=HkbJTYyAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkbJTYyAb", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7858357715661597310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Spherical CNNs", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/144", "id": "Hkbd5xZRb", "author_site": "Taco Cohen, Mario Geiger, Jonas Koehler, Max Welling", "tldr": "We introduce Spherical CNNs, a convolutional network for spherical signals, and apply it to 3D model recognition and molecular energy regression.", "abstract": "Convolutional Neural Networks (CNNs) have become the method of choice for learning problems involving 2D planar images. However, a number of problems of recent interest have created a demand for models that can analyze spherical images. 
Examples include omnidirectional vision for drones, robots, and autonomous cars, molecular regression problems, and global weather and climate modelling. A naive application of convolutional networks to a planar projection of the spherical signal is destined to fail, because the space-varying distortions introduced by such a projection will make translational weight sharing ineffective.\n\nIn this paper we introduce the building blocks for constructing spherical CNNs. We propose a definition for the spherical cross-correlation that is both expressive and rotation-equivariant. The spherical correlation satisfies a generalized Fourier theorem, which allows us to compute it efficiently using a generalized (non-commutative) Fast Fourier Transform (FFT) algorithm. We demonstrate the computational efficiency, numerical accuracy, and effectiveness of spherical CNNs applied to 3D model recognition and atomization energy regression.", "keywords": "deep learning;equivariance;convolution;group convolution;3D;vision;omnidirectional;shape recognition;molecular energy regression", "primary_area": "", "supplementary_material": "", "author": "Taco S. Cohen;Mario Geiger;Jonas K\u00f6hler;Max Welling", "authorids": "taco.cohen@gmail.com;geiger.mario@gmail.com;jonas.koehler.ks@gmail.com;m.welling@uva.nl", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ns.2018spherical,\ntitle={Spherical {CNN}s},\nauthor={Taco S. Cohen and Mario Geiger and Jonas K\u00f6hler and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hkbd5xZRb},\n}", "github": "[![github](/images/github_icon.svg) jonas-koehler/s2cnn](https://github.com/jonas-koehler/s2cnn) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=Hkbd5xZRb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;8;9", "confidence": "3;4;4", "rating_avg": 8.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1203, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6361332838540502667&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=Hkbd5xZRb", "pdf": "https://openreview.net/pdf?id=Hkbd5xZRb", "email": ";;;", "author_num": 4 }, { "id": "HkbmWqxCZ", "title": "The Mutual Autoencoder: Controlling Information in Latent Code Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational autoencoders (VAE) learn probabilistic latent variable models by optimizing a bound on the marginal likelihood of the observed data. Beyond providing a good density model a VAE model assigns to each data instance a latent code. In many applications, this latent code provides a useful high-level summary of the observation. However, the VAE may fail to learn a useful representation when the decoder family is very expressive. This is because maximum likelihood does not explicitly encourage useful representations and the latent variable is used only if it helps model the marginal distribution. This makes representation learning with VAEs unreliable. 
To address this issue, we propose a method for explicitly controlling the amount of information stored in the latent code. Our method can learn codes ranging from independent to nearly deterministic while benefiting from decoder capacity. Thus, we decouple the choice of decoder capacity and the latent code dimensionality from the amount of information stored in the code.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mary Phuong;Max Welling;Nate Kushman;Ryota Tomioka;Sebastian Nowozin", "authorids": "bphuong@ist.ac.at;m.welling@uva.nl;nkushman@microsoft.com;ryoto@microsoft.com;sebastian.nowozin@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nphuong2018the,\ntitle={The Mutual Autoencoder: Controlling Information in Latent Code Representations},\nauthor={Mary Phuong and Max Welling and Nate Kushman and Ryota Tomioka and Sebastian Nowozin},\nyear={2018},\nurl={https://openreview.net/forum?id=HkbmWqxCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkbmWqxCZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6297072441817625697&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "A Hierarchical Model for Device Placement", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/140", "id": "Hkc-TeZ0W", "author_site": "Azalia Mirhoseini, Anna Goldie, Hieu Pham, Benoit Steiner, Quoc V Le, Jeff Dean", "tldr": "We introduce a hierarchical model for efficient, end-to-end placement of computational graphs onto hardware devices.", "abstract": "We introduce a hierarchical model for efficient placement of computational graphs onto hardware devices, especially in heterogeneous environments with a mixture of CPUs, GPUs, and other computational devices. Our method learns to assign graph operations to groups and to allocate those groups to available devices. The grouping and device allocations are learned jointly. The proposed method is trained with policy gradient and requires no human intervention. Experiments with widely-used\ncomputer vision and natural language models show that our algorithm can find optimized, non-trivial placements for TensorFlow computational graphs with over 80,000 operations. In addition, our approach outperforms placements by human\nexperts as well as a previous state-of-the-art placement method based on deep reinforcement learning. Our method achieves runtime reductions of up to 60.6% per training step when applied to models such as Neural Machine Translation.", "keywords": "deep learning;device placement;policy gradient optimization", "primary_area": "", "supplementary_material": "", "author": "Azalia Mirhoseini;Anna Goldie;Hieu Pham;Benoit Steiner;Quoc V. 
Le;Jeff Dean", "authorids": "azalia@google.com;agoldie@google.com;hyhieu@cmu.edu;bsteiner@google.com;qvl@google.com;jeff@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nmirhoseini2018a,\ntitle={A Hierarchical Model for Device Placement},\nauthor={Azalia Mirhoseini and Anna Goldie and Hieu Pham and Benoit Steiner and Quoc V. Le and Jeff Dean},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hkc-TeZ0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;5;8", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": 1.0, "gs_citation": 210, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17910495902735510497&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 13, "openreview": "https://openreview.net/forum?id=Hkc-TeZ0W", "pdf": "https://openreview.net/pdf?id=Hkc-TeZ0W", "email": ";;;;;", "author_num": 6 }, { "id": "HkcTe-bR-", "title": "Exploring Deep Recurrent Models with Reinforcement Learning for Molecule Design", "track": "main", "status": "Workshop", "tldr": "We investigate a variety of RL algorithms for molecular generation and define new benchmarks (to be released as an OpenAI Gym), finding PPO and a hill-climbing MLE algorithm work best.", "abstract": "The design of small molecules with bespoke properties is of central importance to drug discovery. However significant challenges yet remain for computational methods, despite recent advances such as deep recurrent networks and reinforcement learning strategies for sequence generation, and it can be difficult to compare results across different works. This work proposes 19 benchmarks selected by subject experts, expands smaller datasets previously used to approximately 1.1 million training molecules, and explores how to apply new reinforcement learning techniques effectively for molecular design. The benchmarks here, built as OpenAI Gym environments, will be open-sourced to encourage innovation in molecular design algorithms and to enable usage by those without a background in chemistry. 
Finally, this work explores recent development in reinforcement-learning methods with excellent sample complexity (the A2C and PPO algorithms) and investigates their behavior in molecular generation, demonstrating significant performance gains compared to standard reinforcement learning techniques.", "keywords": "reinforcement learning;molecule design;de novo design;ppo;sample-efficient reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Daniel Neil;Marwin Segler;Laura Guasch;Mohamed Ahmed;Dean Plumbley;Matthew Sellwood;Nathan Brown", "authorids": "daniel.neil@benevolent.ai;marwin.segler@benevolent.ai;laura.guasch@benevolent.ai;mohamed.ahmed@benevolent.ai;dean.plumbley@benevolent.ai;matthew.sellwood@benevolent.ai;nathan.brown@benevolent.ai", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nneil2018exploring,\ntitle={Exploring Deep Recurrent Models with Reinforcement Learning for Molecule Design},\nauthor={Daniel Neil and Marwin Segler and Laura Guasch and Mohamed Ahmed and Dean Plumbley and Matthew Sellwood and Nathan Brown},\nyear={2018},\nurl={https://openreview.net/forum?id=HkcTe-bR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkcTe-bR-", "pdf_size": 0, "rating": "4;6;7", "confidence": "2;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4864809591239422911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HkeJVllRW", "title": "Sparse-Complementary Convolution for Efficient Model Utilization on CNNs", "track": "main", "status": "Reject", "tldr": "TL;DR", "abstract": "We introduce an efficient way to increase the accuracy of convolution neural networks (CNNs) based on high model utilization without increasing any computational complexity.\nThe proposed sparse-complementary convolution replaces regular convolution with sparse and complementary shapes of kernels, covering the same receptive field. \nBy the nature of deep learning, high model utilization of a CNN can be achieved with more simpler kernels rather than fewer complex kernels.\nThis simple but insightful model reuses of recent network architectures, ResNet and DenseNet, can provide better accuracy for most classification tasks (CIFAR-10/100 and ImageNet) compared to their baseline models. By simply replacing the convolution of a CNN with our sparse-complementary convolution, at the same FLOPs and parameters, we can improve top-1 accuracy on ImageNet by 0.33% and 0.18% for ResNet-101 and ResNet-152, respectively. 
A similar accuracy improvement could be gained by increasing the number of layers in those networks by ~1.5x.", "keywords": "CNN;sparse convolution;sparse kernel;sparsity;model utilization;image classification", "primary_area": "", "supplementary_material": "", "author": "Chun-Fu (Richard) Chen;Jinwook Oh;Quanfu Fan;Marco Pistoia;Gwo Giun (Chris) Lee", "authorids": "chenrich@us.ibm.com;ohj@us.ibm.com;qfan@us.ibm.com;pistoia@us.ibm.com;clee@mail.ncku.edu.tw", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\n(richard)2018sparsecomplementary,\ntitle={Sparse-Complementary Convolution for Efficient Model Utilization on {CNN}s},\nauthor={Chun-Fu (Richard) Chen and Jinwook Oh and Quanfu Fan and Marco Pistoia and Gwo Giun (Chris) Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=HkeJVllRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkeJVllRW", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:v3dSFoTN9SQJ:scholar.google.com/&scioq=Sparse-Complementary+Convolution+for+Efficient+Model+Utilization+on+CNNs&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkepKG-Rb", "title": "A Semantic Loss Function for Deep Learning with Symbolic Knowledge", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper develops a novel methodology for using symbolic knowledge in deep learning. From first principles, we derive a semantic loss function that bridges between neural output vectors and logical constraints. This loss function captures how close the neural network is to satisfying the constraints on its output. An experimental evaluation shows that our semantic loss function effectively guides the learner to achieve (near-)state-of-the-art results on semi-supervised multi-class classification. Moreover, it significantly increases the ability of the neural network to predict structured objects, such as rankings and shortest paths. 
These discrete concepts are tremendously difficult to learn, and benefit from a tight integration of deep learning and symbolic reasoning methods.", "keywords": "deep learning;symbolic knowledge;semi-supervised learning;constraints", "primary_area": "", "supplementary_material": "", "author": "Jingyi Xu;Zilu Zhang;Tal Friedman;Yitao Liang;Guy Van den Broeck", "authorids": "jixu@g.ucla.edu;zhangzilu@pku.edu.cn;tal@cs.ucla.edu;yliang@cs.ucla.edu;guyvdb@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nxu2018a,\ntitle={A Semantic Loss Function for Deep Learning with Symbolic Knowledge},\nauthor={Jingyi Xu and Zilu Zhang and Tal Friedman and Yitao Liang and Guy Van den Broeck},\nyear={2018},\nurl={https://openreview.net/forum?id=HkepKG-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkepKG-Rb", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 617, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2687938736648965063&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16 }, { "title": "Neural Sketch Learning for Conditional Program Generation", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/89", "id": "HkfXMz-Ab", "author_site": "Vijayaraghavan Murali, Letao Qi, Swarat Chaudhuri, Chris Jermaine", "tldr": "We give a method for generating type-safe programs in a Java-like language, given a small amount of syntactic information about the desired code.", "abstract": "We study the problem of generating source code in a strongly typed,\nJava-like programming language, given a label (for example a set of\nAPI calls or types) carrying a small amount of information about the\ncode that is desired. The generated programs are expected to respect a\n`\"realistic\" relationship between programs and labels, as exemplified\nby a corpus of labeled programs available during training.\n\nTwo challenges in such *conditional program generation* are that\nthe generated programs must satisfy a rich set of syntactic and\nsemantic constraints, and that source code contains many low-level\nfeatures that impede learning. We address these problems by training\na neural generator not on code but on *program sketches*, or\nmodels of program syntax that abstract out names and operations that\ndo not generalize across programs. During generation, we infer a\nposterior distribution over sketches, then concretize samples from\nthis distribution into type-safe programs using combinatorial\ntechniques. 
We implement our ideas in a system for generating\nAPI-heavy Java code, and show that it can often predict the entire\nbody of a method given just a few API calls or data types that appear\nin the method.", "keywords": "Program generation;Source code;Program synthesis;Deep generative models", "primary_area": "", "supplementary_material": "", "author": "Vijayaraghavan Murali;Letao Qi;Swarat Chaudhuri;Chris Jermaine", "authorids": "vijay@rice.edu;letao.qi@rice.edu;swarat@rice.edu;cmj4@rice.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmurali2018neural,\ntitle={Neural Sketch Learning for Conditional Program Generation},\nauthor={Vijayaraghavan Murali and Letao Qi and Swarat Chaudhuri and Chris Jermaine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkfXMz-Ab},\n}", "github": "[![github](/images/github_icon.svg) capergroup/bayou](https://github.com/capergroup/bayou)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;2;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 164, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11134234129920472875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkfXMz-Ab", "pdf": "https://openreview.net/pdf?id=HkfXMz-Ab", "email": ";;;", "author_num": 4 }, { "id": "Hkfmn5n6W", "title": "Exponentially vanishing sub-optimal local minima in multilayer neural networks", "track": "main", "status": "Workshop", "tldr": "\"Bad\" local minima are vanishing in a multilayer neural net: a proof with more reasonable assumptions than before", "abstract": "Background: Statistical mechanics results (Dauphin et al. (2014); Choromanska et al. (2015)) suggest that local minima with high error are exponentially rare in high dimensions. However, to prove low error guarantees for Multilayer Neural Networks (MNNs), previous works so far required either a heavily modified MNN model or training method, strong assumptions on the labels (e.g., \u201cnear\u201d linear separability), or an unrealistically wide hidden layer with $\\Omega(N)$ units. \n\nResults: We examine a MNN with one hidden layer of piecewise linear units, a single output, and a quadratic loss. We prove that, with high probability in the limit of $N\\rightarrow\\infty$ datapoints, the volume of differentiable regions of the empiric loss containing sub-optimal differentiable local minima is exponentially vanishing in comparison with the same volume of global minima, given standard normal input of dimension $d_0=\\tilde{\\Omega}(\\sqrt{N})$, and a more realistic number of $d_1=\\tilde{\\Omega}(N/d_0)$ hidden units. 
We demonstrate our results numerically: for example, 0% binary classification training error on CIFAR with only N/d_0 = 16 hidden neurons.", "keywords": "neural networks;theory;optimization;local minima;loss landscape", "primary_area": "", "supplementary_material": "", "author": "Daniel Soudry;Elad Hoffer", "authorids": "daniel.soudry@gmail.com;elad.hoffer@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsoudry2018exponentially,\ntitle={Exponentially vanishing sub-optimal local minima in multilayer neural networks},\nauthor={Daniel Soudry and Elad Hoffer},\nyear={2018},\nurl={https://openreview.net/forum?id=Hkfmn5n6W},\n}", "github": "[![github](/images/github_icon.svg) MNNsMinima/Paper](https://github.com/MNNsMinima/Paper)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hkfmn5n6W", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;2", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16572109840860859919&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Distributed Fine-tuning of Language Models on Private Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/326", "id": "HkgNdt26Z", "author_site": "Vadim Popov, Mikhail Kudinov, Irina Piontkovskaya, Petr Vytovtov, Alex Nevidomsky", "tldr": "We propose a method of distributed fine-tuning of language models on user devices without collection of private data", "abstract": "One of the big challenges in machine learning applications is that training data can be different from the real-world data faced by the algorithm. In language modeling, users\u2019 language (e.g. in private messaging) could change in a year and be completely different from what we observe in publicly available data. At the same time, public data can be used for obtaining general knowledge (i.e. general model of English). We study approaches to distributed fine-tuning of a general model on user private data with the additional requirements of maintaining the quality on the general data and minimization of communication costs. We propose a novel technique that significantly improves prediction quality on users\u2019 language compared to a general model and outperforms gradient compression methods in terms of communication efficiency. The proposed procedure is fast and leads to an almost 70% perplexity reduction and 8.7 percentage point improvement in keystroke saving rate on informal English texts. 
Finally, we propose an experimental framework for evaluating differential privacy of distributed training of language models and show that our approach has good privacy guarantees.", "keywords": "distributed training;federated learning;language modeling;differential privacy", "primary_area": "", "supplementary_material": "", "author": "Vadim Popov;Mikhail Kudinov;Irina Piontkovskaya;Petr Vytovtov;Alex Nevidomsky", "authorids": "v.popov@samsung.com;m.kudinov@samsung.com;p.irina@samsung.com;p.vytovtov@partner.samsung.com;a.nevidomsky@samsung.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\npopov2018distributed,\ntitle={Distributed Fine-tuning of Language Models on Private Data},\nauthor={Vadim Popov and Mikhail Kudinov and Irina Piontkovskaya and Petr Vytovtov and Alex Nevidomsky},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkgNdt26Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2477607251805585860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HkgNdt26Z", "pdf": "https://openreview.net/pdf?id=HkgNdt26Z", "email": ";;;;", "author_num": 5 }, { "id": "Hki-ZlbA-", "title": "Ground-Truth Adversarial Examples", "track": "main", "status": "Reject", "tldr": "We use formal verification to assess the effectiveness of techniques for finding adversarial examples or for defending against adversarial examples.", "abstract": "The ability to deploy neural networks in real-world, safety-critical systems is severely limited by the presence of adversarial examples: slightly perturbed inputs that are misclassified by the network. In recent years, several techniques have been proposed for training networks that are robust to such examples; and each time stronger attacks have been devised, demonstrating the shortcomings of existing defenses. This highlights a key difficulty in designing an effective defense: the inability to assess a network's robustness against future attacks. We propose to address this difficulty through formal verification techniques. We construct ground truths: adversarial examples with a provably-minimal distance from a given input point. We demonstrate how ground truths can serve to assess the effectiveness of attack techniques, by comparing the adversarial examples produced by those attacks to the ground truths; and also of defense techniques, by computing the distance to the ground truths before and after the defense is applied, and measuring the improvement. We use this technique to assess recently suggested attack and defense techniques.\n", "keywords": "adversarial examples;neural networks;formal verification;ground truths", "primary_area": "", "supplementary_material": "", "author": "Nicholas Carlini;Guy Katz;Clark Barrett;David L. 
Dill", "authorids": "nicholas@carlini.com;katz911@gmail.com;barrett@cs.stanford.edu;dill@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ncarlini2018groundtruth,\ntitle={Ground-Truth Adversarial Examples},\nauthor={Nicholas Carlini and Guy Katz and Clark Barrett and David L. Dill},\nyear={2018},\nurl={https://openreview.net/forum?id=Hki-ZlbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hki-ZlbA-", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11978123379984150991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HkinqfbAb", "title": "Automatic Parameter Tying in Neural Networks", "track": "main", "status": "Reject", "tldr": "A k-means prior combined with L1 regularization yields state-of-the-art compression results.", "abstract": "Recently, there has been growing interest in methods that perform neural network compression, namely techniques that attempt to substantially reduce the size of a neural network without significant reduction in performance. However, most existing methods are post-processing approaches in that they take a learned neural network as input and output a compressed network by either forcing several parameters to take the same value (parameter tying via quantization) or pruning irrelevant edges (pruning) or both. In this paper, we propose a novel algorithm that jointly learns and compresses a neural network. The key idea in our approach is to change the optimization criteria by adding $k$ independent Gaussian priors over the parameters and a sparsity penalty. We show that our approach is easy to implement using existing neural network libraries, generalizes L1 and L2 regularization and elegantly enforces parameter tying as well as pruning constraints. Experimentally, we demonstrate that our new algorithm yields state-of-the-art compression on several standard benchmarks with minimal loss in accuracy while requiring little to no hyperparameter tuning as compared with related, competing approaches. 
", "keywords": "neural network;quantization;compression", "primary_area": "", "supplementary_material": "", "author": "Yibo Yang;Nicholas Ruozzi;Vibhav Gogate", "authorids": "yibo.yang@utdallas.edu;nicholas.ruozzi@utdallas.edu;vgogate@hlt.utdallas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyang2018automatic,\ntitle={Automatic Parameter Tying in Neural Networks},\nauthor={Yibo Yang and Nicholas Ruozzi and Vibhav Gogate},\nyear={2018},\nurl={https://openreview.net/forum?id=HkinqfbAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkinqfbAb", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;4;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dna2Gi4Gg-cJ:scholar.google.com/&scioq=Automatic+Parameter+Tying+in+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkjL6MiTb", "title": "Siamese Survival Analysis with Competing Risks", "track": "main", "status": "Reject", "tldr": "In this work we introduce a novel Siamese Deep Neural Network architecture that is able to effectively learn from data in the presence of multiple adverse events.", "abstract": "Survival Analysis (time-to-event analysis) in the presence of multiple possible adverse events, i.e., competing risks, is a challenging, yet very important problem in medicine, finance, manufacturing, etc. Extending classical survival analysis to competing risks is not trivial since only one event (e.g. one cause of death) is observed and hence, the incidence of an event of interest is often obscured by other related competing events. This leads to the nonidentifiability of the event times\u2019 distribution parameters, which makes the problem significantly more challenging. In this work we introduce Siamese Survival Prognosis Network, a novel Siamese Deep Neural Network architecture that is able to effectively learn from data in the presence of multiple adverse events. The Siamese Survival Network is especially crafted to issue pairwise concordant time-dependent risks, in which longer event times are assigned lower risks. Furthermore, our architecture is able to directly optimize an approximation to the C-discrimination index, rather than relying on well-known metrics of cross-entropy etc., and which are not able to capture the unique requirements of survival analysis with competing risks. 
Our results show consistent performance improvements on a number of publicly available medical datasets over both statistical and deep learning state-of-the-art methods.", "keywords": "survival analysis;competing risks;siamese neural networks", "primary_area": "", "supplementary_material": "", "author": "Anton Nemchenko;Kartik Ahuja;Mihaela Van Der Schaar", "authorids": "santon834@g.ucla.edu;ahujak@ucla.edu;mihaela@ee.ucla.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnemchenko2018siamese,\ntitle={Siamese Survival Analysis with Competing Risks},\nauthor={Anton Nemchenko and Kartik Ahuja and Mihaela Van Der Schaar},\nyear={2018},\nurl={https://openreview.net/forum?id=HkjL6MiTb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkjL6MiTb", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11544511116411707917&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HklZOfW0W", "title": "UPS: optimizing Undirected Positive Sparse graph for neural graph filtering", "track": "main", "status": "Reject", "tldr": "Graph Optimization with signal filtering in the vertex domain.", "abstract": "In this work we propose a novel approach for learning graph representation of the data using gradients obtained via backpropagation. Next we build a neural network architecture compatible with our optimization approach and motivated by graph filtering in the vertex domain. We demonstrate that the learned graph has richer structure than often used nearest neighbors graphs constructed based on features similarity. 
Our experiments demonstrate that we can improve prediction quality for several convolution on graphs architectures, while others appeared to be insensitive to the input graph.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mikhail Yurochkin;Dung Thai;Hung Hai Bui;XuanLong Nguyen", "authorids": "moonfolk@umich.edu;dthai@iesl.cs.umass.edu;bui.h.hung@gmail.com;xuanlong@umich.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyurochkin2018ups,\ntitle={{UPS}: optimizing Undirected Positive Sparse graph for neural graph filtering},\nauthor={Mikhail Yurochkin and Dung Thai and Hung Hai Bui and XuanLong Nguyen},\nyear={2018},\nurl={https://openreview.net/forum?id=HklZOfW0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HklZOfW0W", "pdf_size": 0, "rating": "3;4;6", "confidence": "3;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MaI67emyfDkJ:scholar.google.com/&scioq=UPS:+optimizing+Undirected+Positive+Sparse+graph+for+neural+graph+filtering&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "HklpCzC6-", "title": "Image Segmentation by Iterative Inference from Conditional Score Estimation", "track": "main", "status": "Reject", "tldr": "Refining segmentation proposals by performing iterative inference with conditional denoising autoencoders.", "abstract": "Inspired by the combination of feedforward and iterative computations in the visual cortex, and taking advantage of the ability of denoising autoencoders to estimate the score of a joint distribution, we propose a novel approach to iterative inference for capturing and exploiting the complex joint distribution of output variables conditioned on some input variables. This approach is applied to image pixel-wise segmentation, with the estimated conditional score used to perform gradient ascent towards a mode of the estimated conditional distribution. This extends previous work on score estimation by denoising autoencoders to the case of a conditional distribution, with a novel use of a corrupted feedforward predictor replacing Gaussian corruption. An advantage of this approach over more classical ways to perform iterative inference for structured outputs, like conditional random fields (CRFs), is that it is not any more necessary to define an explicit energy function linking the output variables. To keep computations tractable, such energy function parametrizations are typically fairly constrained, involving only a few neighbors of each of the output variables in each clique. 
We experimentally find that the proposed iterative inference from conditional score estimation by conditional denoising autoencoders performs better than comparable models based on CRFs or those not using any explicit modeling of the conditional joint distribution of outputs.", "keywords": "semantic segmentation;conditional denoising autoencoders;iterative inference", "primary_area": "", "supplementary_material": "", "author": "Adriana Romero;Michal Drozdzal;Akram Erraqabi;Simon J\u00e9gou;Yoshua Bengio", "authorids": "adriana.romsor@gmail.com;michal.drozdzal@gmail.com;akram.er-raqabi@umontreal.ca;simon.jegou@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nromero2018image,\ntitle={Image Segmentation by Iterative Inference from Conditional Score Estimation},\nauthor={Adriana Romero and Michal Drozdzal and Akram Erraqabi and Simon J\u00e9gou and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=HklpCzC6-},\n}", "github": "[![github](/images/github_icon.svg) adri-romsor/iterative_inference_segm](https://github.com/adri-romsor/iterative_inference_segm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HklpCzC6-", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=371132795826433934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HkmaTz-0W", "title": "Visualizing the Loss Landscape of Neural Nets", "track": "main", "status": "Workshop", "tldr": "We explore the structure of neural loss functions, and the effect of loss landscapes on generalization, using a range of visualization methods.", "abstract": "Neural network training relies on our ability to find \"good\" minimizers of highly non-convex loss functions. It is well known that certain network architecture designs (e.g., skip connections) produce loss functions that train easier, and well-chosen training parameters (batch size, learning rate, optimizer) produce minimizers that generalize better. However, the reasons for these differences, and their effect on the underlying loss landscape, are not well understood.\n\nIn this paper, we explore the structure of neural loss functions, and the effect of loss landscapes on generalization, using a range of visualization methods. First, we introduce a simple \"filter normalization\" method that helps us visualize loss function curvature, and make meaningful side-by-side comparisons between loss functions. 
Then, using a variety of visualizations, we explore how network architecture affects the loss landscape, and how training parameters affect the shape of minimizers.", "keywords": "visualization;loss surface;flatness;sharpness", "primary_area": "", "supplementary_material": "", "author": "Hao Li;Zheng Xu;Gavin Taylor;Tom Goldstein", "authorids": "haoli@cs.umd.edu;xuzh@cs.umd.edu;taylor@usna.edu;tomg@cs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2018visualizing,\ntitle={Visualizing the Loss Landscape of Neural Nets},\nauthor={Hao Li and Zheng Xu and Gavin Taylor and Tom Goldstein},\nyear={2018},\nurl={https://openreview.net/forum?id=HkmaTz-0W},\n}", "github": "[![github](/images/github_icon.svg) tomgoldstein/loss-landscape](https://github.com/tomgoldstein/loss-landscape) + [![Papers with Code](/images/pwc_icon.svg) 10 community implementations](https://paperswithcode.com/paper/?openreview=HkmaTz-0W)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkmaTz-0W", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2486, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11650483902238288010&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "title": "Learning how to explain neural networks: PatternNet and PatternAttribution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/322", "id": "Hkn7CBaTW", "author_site": "Pieter-Jan Kindermans, Kristof T Sch\u00fctt, Maximilian Alber, Klaus R Muller, Dumitru Erhan, Been Kim, Sven D\u00e4hne", "tldr": "Without learning, it is impossible to explain a machine learning model's decisions.", "abstract": "DeConvNet, Guided BackProp, LRP, were invented to better understand deep neural networks. We show that these methods do not produce the theoretically correct explanation for a linear model. Yet they are used on multi-layer networks with millions of parameters. This is a cause for concern since linear models are simple neural networks. We argue that explanation methods for neural nets should work reliably in the limit of simplicity, the linear models. Based on our analysis of linear models we propose a generalization that yields two explanation techniques (PatternNet and PatternAttribution) that are theoretically sound for linear models and produce improved explanations for deep networks.\n", "keywords": "machine learning;interpretability;deep learning", "primary_area": "", "supplementary_material": "", "author": "Pieter-Jan Kindermans;Kristof T. 
Sch\u00fctt;Maximilian Alber;Klaus-Robert M\u00fcller;Dumitru Erhan;Been Kim;Sven D\u00e4hne", "authorids": "pikinder@google.com;kristof.schuett@tu-berlin.de;maximilian.aber@tu-berlin.de;klaus-robert.mueller@tu-berlin.de;dumitru@google.com;beenkim@google.com;sven.daehne@tu-berlin.de", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nkindermans2018learning,\ntitle={Learning how to explain neural networks: PatternNet and PatternAttribution},\nauthor={Pieter-Jan Kindermans and Kristof T. Sch\u00fctt and Maximilian Alber and Klaus-Robert M\u00fcller and Dumitru Erhan and Been Kim and Sven D\u00e4hne},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hkn7CBaTW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Hkn7CBaTW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;8;8", "confidence": "4;3;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 7, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 433, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2569380699011746948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Hkn7CBaTW", "pdf": "https://openreview.net/pdf?id=Hkn7CBaTW", "email": ";;;;;;", "author_num": 7 }, { "id": "HknbyQbC-", "title": "Generating Adversarial Examples with Adversarial Networks", "track": "main", "status": "Reject", "tldr": "We propose to generate adversarial example based on generative adversarial networks in a semi-whitebox and black-box settings.", "abstract": "Deep neural networks (DNNs) have been found to be vulnerable to adversarial examples resulting from adding small-magnitude perturbations to inputs. Such adversarial examples can mislead DNNs to produce adversary-selected results.\nDifferent attack strategies have been proposed to generate adversarial examples, but how to produce them with high perceptual quality and more efficiently requires more research efforts. \nIn this paper, we propose AdvGAN to generate adversarial examples with generative adversarial networks (GANs), which can learn and approximate the distribution of original instances. \nFor AdvGAN, once the generator is trained, it can generate adversarial perturbations efficiently for any instance, so as to potentially accelerate adversarial training as defenses. \nWe apply AdvGAN in both semi-whitebox and black-box attack settings. In semi-whitebox attacks, there is no need to access the original target model after the generator is trained, in contrast to traditional white-box attacks. In black-box attacks, we dynamically train a distilled model for the black-box model and optimize the generator accordingly.\nAdversarial examples generated by AdvGAN on different target models have high attack success rate under state-of-the-art defenses compared to other attacks. Our attack has placed the first with 92.76% accuracy on a public MNIST black-box attack challenge. 
", "keywords": "adversarial examples;generative adversarial network;black-box attack", "primary_area": "", "supplementary_material": "", "author": "Chaowei Xiao;Bo Li;Jun-Yan Zhu;Warren He;Mingyan Liu;Dawn Song", "authorids": "xiaocw@umich.edu;lxbosky@gmail.com;junyanz@berkeley.edu;_w@eecs.berkeley.edu;mingyan@umich.edu;dawnsong.travel@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nxiao2018generating,\ntitle={Generating Adversarial Examples with Adversarial Networks},\nauthor={Chaowei Xiao and Bo Li and Jun-Yan Zhu and Warren He and Mingyan Liu and Dawn Song},\nyear={2018},\nurl={https://openreview.net/forum?id=HknbyQbC-},\n}", "github": "[![github](/images/github_icon.svg) MadryLab/mnist_challenge](https://github.com/MadryLab/mnist_challenge) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=HknbyQbC-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HknbyQbC-", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 22, "authors#_avg": 6, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 1148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12845583525649303247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "title": "Monotonic Chunkwise Attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/211", "id": "Hko85plCW", "author_site": "Chung-Cheng Chiu, Colin Raffel", "tldr": "An online and linear-time attention mechanism that performs soft attention over adaptively-located chunks of the input sequence.", "abstract": "Sequence-to-sequence models with soft attention have been successfully applied to a wide variety of problems, but their decoding process incurs a quadratic time and space cost and is inapplicable to real-time sequence transduction. To address these issues, we propose Monotonic Chunkwise Attention (MoChA), which adaptively splits the input sequence into small chunks over which soft attention is computed. We show that models utilizing MoChA can be trained efficiently with standard backpropagation while allowing online and linear-time decoding at test time. When applied to online speech recognition, we obtain state-of-the-art results and match the performance of a model using an offline soft attention mechanism. 
In document summarization experiments where we do not expect monotonic alignments, we show significantly improved performance compared to a baseline monotonic attention-based model.", "keywords": "attention;sequence-to-sequence;speech recognition;document summarization", "primary_area": "", "supplementary_material": "", "author": "Chung-Cheng Chiu*;Colin Raffel*", "authorids": "chungchengc@google.com;craffel@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nchiu*2018monotonic,\ntitle={Monotonic Chunkwise Attention},\nauthor={Chung-Cheng Chiu* and Colin Raffel*},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hko85plCW},\n}", "github": "[![github](/images/github_icon.svg) craffel/mocha](https://github.com/craffel/mocha)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 319, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6779239600518198216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Hko85plCW", "pdf": "https://openreview.net/pdf?id=Hko85plCW", "email": ";", "author_num": 2 }, { "id": "Hkp3uhxCW", "title": "Revisiting Bayes by Backprop", "track": "main", "status": "Reject", "tldr": " Variational Bayes scheme for Recurrent Neural Networks", "abstract": "In this work we explore a straightforward variational Bayes scheme for Recurrent Neural Networks.\nFirstly, we show that a simple adaptation of truncated backpropagation through time can yield good quality uncertainty estimates and superior regularisation at only a small extra computational cost during training, also reducing the amount of parameters by 80\\%.\nSecondly, we demonstrate how a novel kind of posterior approximation yields further improvements to the performance of Bayesian RNNs. We incorporate local gradient information into the approximate posterior to sharpen it around the current batch statistics. We show how this technique is not exclusive to recurrent neural networks and can be applied more widely to train Bayesian neural networks.\nWe also empirically demonstrate how Bayesian RNNs are superior to traditional RNNs on a language modelling benchmark and an image captioning task, as well as showing how each of these methods improve our model over a variety of other schemes for training them. 
We also introduce a new benchmark for studying uncertainty for language models so future methods can be easily compared.", "keywords": "Bayesian;Deep Learning;Recurrent Neural Networks;LSTM", "primary_area": "", "supplementary_material": "", "author": "Meire Fortunato;Charles Blundell;Oriol Vinyals", "authorids": "meirefortunato@google.com;cblundell@google.com;vinyals@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfortunato2018revisiting,\ntitle={Revisiting Bayes by Backprop},\nauthor={Meire Fortunato and Charles Blundell and Oriol Vinyals},\nyear={2018},\nurl={https://openreview.net/forum?id=Hkp3uhxCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkp3uhxCW", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6538762853552067448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HkpRBFxRb", "title": "Learning to Mix n-Step Returns: Generalizing Lambda-Returns for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "A novel way to generalize lambda-returns by allowing the RL agent to decide how much it wants to weigh each of the n-step returns.", "abstract": "Reinforcement Learning (RL) can model complex behavior policies for goal-directed sequential decision making tasks. A hallmark of RL algorithms is Temporal Difference (TD) learning: value function for the current state is moved towards a bootstrapped target that is estimated using the next state's value function. lambda-returns define the target of the RL agent as a weighted combination of rewards estimated by using multiple many-step look-aheads. Although mathematically tractable, the use of exponentially decaying weighting of n-step returns based targets in lambda-returns is a rather ad-hoc design choice. Our major contribution is that we propose a generalization of lambda-returns called Confidence-based Autodidactic Returns (CAR), wherein the RL agent learns the weighting of the n-step returns in an end-to-end manner. In contrast to lambda-returns wherein the RL agent is restricted to use an exponentially decaying weighting scheme, CAR allows the agent to learn to decide how much it wants to weigh the n-step returns based targets. Our experiments, in addition to showing the efficacy of CAR, also empirically demonstrate that using sophisticated weighted mixtures of multi-step returns (like CAR and lambda-returns) considerably outperforms the use of n-step returns. 
We perform our experiments on the Asynchronous Advantage Actor Critic (A3C) algorithm in the Atari 2600 domain.", "keywords": "Reinforcement Learning;Lambda-Returns", "primary_area": "", "supplementary_material": "", "author": "Sahil Sharma;Girish Raguvir J *;Srivatsan Ramesh *;Balaraman Ravindran", "authorids": "sahil@cse.iitm.ac.in;girishraguvir@gmail.com;sriramesh4@gmail.com;ravi@cse.iitm.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsharma2018learning,\ntitle={Learning to Mix n-Step Returns: Generalizing Lambda-Returns for Deep Reinforcement Learning},\nauthor={Sahil Sharma and Girish Raguvir J * and Srivatsan Ramesh * and Balaraman Ravindran},\nyear={2018},\nurl={https://openreview.net/forum?id=HkpRBFxRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkpRBFxRb", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12607585771454979902&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HkpYwMZRb", "title": "Gradients explode - Deep Networks are shallow - ResNet explained", "track": "main", "status": "Workshop", "tldr": "We show that in contrast to popular wisdom, the exploding gradient problem has not been solved and that it limits the depth to which MLPs can be effectively trained. We show why gradients explode and how ResNet handles them.", "abstract": "Whereas it is believed that techniques such as Adam, batch normalization and, more recently, SeLU nonlinearities ``solve'' the exploding gradient problem, we show that this is not the case and that in a range of popular MLP architectures, exploding gradients exist and that they limit the depth to which networks can be effectively trained, both in theory and in practice. We explain why exploding gradients occur and highlight the {\\it collapsing domain problem}, which can arise in architectures that avoid exploding gradients. \n\nResNets have significantly lower gradients and thus can circumvent the exploding gradient problem, enabling the effective training of much deeper networks, which we show is a consequence of a surprising mathematical property. By noticing that {\\it any neural network is a residual network}, we devise the {\\it residual trick}, which reveals that introducing skip connections simplifies the network mathematically, and that this simplicity may be the major cause for their success.", "keywords": "deep learning;MLP;ResNet;residual network;exploding gradient problem;vanishing gradient problem;effective depth;batch normalization;covariate shift", "primary_area": "", "supplementary_material": "", "author": "George Philipp;Dawn Song;Jaime G. Carbonell", "authorids": "george.philipp@email.de;dawnsong@gmail.com;jgc@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nphilipp2018gradients,\ntitle={Gradients explode - Deep Networks are shallow - ResNet explained},\nauthor={George Philipp and Dawn Song and Jaime G. 
Carbonell},\nyear={2018},\nurl={https://openreview.net/forum?id=HkpYwMZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkpYwMZRb", "pdf_size": 0, "rating": "3;5;8", "confidence": "2;4;1", "rating_avg": 5.333333333333333, "confidence_avg": 2.3333333333333335, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": -0.43355498476206006, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16347061525099758909&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Hkq4mpM5f", "title": "Embedding Deep Networks into Visual Explanations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose a novel explanation module to explain the predictions made by a deep network. Explanation module works by embedding a high-dimensional deep network layer nonlinearly into a low-dimensional explanation space while retaining faithfulness, so that the original deep learning predictions can be constructed from the few concepts extracted by the explanation module. We then visualize such concepts for human to learn about the high-level concepts that deep learning is using to make decisions. We propose an algorithm called Sparse Reconstruction Autoencoder (SRAE) for learning the embedding to the explanation space. SRAE aims to reconstruct part of the original feature space while retaining faithfulness. A visualization system is then introduced for human understanding of features in the explanation space. The proposed method is applied to explain CNN models in image classification tasks, and several novel metrics are introduced to evaluate the performance of explanations quantitatively without human involvement. Experiments show that the proposed approach could generate better explanations of the mechanisms CNN use for making predictions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhongang Qi;Fuxin Li", "authorids": "qiz@oregonstate.edu;lif@eecs.oregonstate.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Hkq4mpM5f", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15780012350616922962&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Combining Symbolic Expressions and Black-box Function Evaluations in Neural Programs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/110", "id": "Hksj2WWAW", "author_site": "Forough Arabshahi, Sameer Singh, anima anandkumar", "tldr": "", "abstract": "Neural programming involves training neural networks to learn programs, mathematics, or logic from data. Previous works have failed to achieve good generalization performance, especially on problems and programs with high complexity or on large domains. This is because they mostly rely either on black-box function evaluations that do not capture the structure of the program, or on detailed execution traces that are expensive to obtain, and hence the training data has poor coverage of the domain under consideration. 
We present a novel framework that utilizes black-box function evaluations, in conjunction with symbolic expressions that define relationships between the given functions. We employ tree LSTMs to incorporate the structure of the symbolic expression trees. We use tree encoding for numbers present in function evaluation data, based on their decimal representation. We present an evaluation benchmark for this task to demonstrate our proposed model combines symbolic reasoning and function evaluation in a fruitful manner, obtaining high accuracies in our experiments. Our framework generalizes significantly better to expressions of higher depth and is able to fill partial equations with valid completions.", "keywords": "symbolic reasoning;mathematical equations;recursive neural networks;neural programing", "primary_area": "", "supplementary_material": "", "author": "Forough Arabshahi;Sameer Singh;Animashree Anandkumar", "authorids": "farabsha@uci.edu;sameer@uci.edu;animakumar@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\narabshahi2018combining,\ntitle={Combining Symbolic Expressions and Black-box Function Evaluations for Training Neural Programs},\nauthor={Forough Arabshahi and Sameer Singh and Animashree Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hksj2WWAW},\n}", "github": "[![github](/images/github_icon.svg) ForoughA/neuralMath](https://github.com/ForoughA/neuralMath)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12704807079952611027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=Hksj2WWAW", "pdf": "https://openreview.net/pdf?id=Hksj2WWAW", "email": ";;", "author_num": 3 }, { "id": "HksxTdiWz", "title": "withdrawn", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Paper was withdrawn.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "withdrawn", "authorids": "withdrawn", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HksxTdiWz", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Towards Neural Phrase-based Machine Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/306", "id": "HktJec1RZ", "author_site": "Po-Sen Huang, Chong Wang, Sitao Huang, Dengyong Zhou, Li Deng", "tldr": "Neural phrase-based machine translation with linear decoding time", "abstract": "In this paper, we present Neural Phrase-based Machine Translation (NPMT). 
Our method explicitly models the phrase structures in output sequences using Sleep-WAke Networks (SWAN), a recently proposed segmentation-based sequence modeling method. To mitigate the monotonic alignment requirement of SWAN, we introduce a new layer to perform (soft) local reordering of input sequences. Different from existing neural machine translation (NMT) approaches, NPMT does not use attention-based decoding mechanisms. Instead, it directly outputs phrases in a sequential order and can decode in linear time. Our experiments show that NPMT achieves superior performances on IWSLT 2014 German-English/English-German and IWSLT 2015 English-Vietnamese machine translation tasks compared with strong NMT baselines. We also observe that our method produces meaningful phrases in output languages.", "keywords": "Neural Machine Translation;Sequence to Sequence;Sequence Modeling", "primary_area": "", "supplementary_material": "", "author": "Po-Sen Huang;Chong Wang;Sitao Huang;Dengyong Zhou;Li Deng", "authorids": "huang.person@gmail.com;chongw@google.com;shuang91@illinois.edu;dennyzhou@gmail.com;l.deng@ieee.org", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhuang2018towards,\ntitle={Towards Neural Phrase-based Machine Translation},\nauthor={Po-Sen Huang and Chong Wang and Sitao Huang and Dengyong Zhou and Li Deng},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HktJec1RZ},\n}", "github": "[![github](/images/github_icon.svg) posenhuang/NPMT](https://github.com/posenhuang/NPMT) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HktJec1RZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14839462711165509564&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HktJec1RZ", "pdf": "https://openreview.net/pdf?id=HktJec1RZ", "email": ";;;;", "author_num": 5 }, { "title": "Learning Deep Mean Field Games for Modeling Large Population Behavior", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/268", "id": "HktK4BeCZ", "author_site": "Jiachen Yang, Xiaojing Ye, Rakshit Trivedi, huan xu, Hongyuan Zha", "tldr": "Inference of a mean field game (MFG) model of large population behavior via a synthesis of MFG and Markov decision processes.", "abstract": "We consider the problem of representing collective behavior of large populations and predicting the evolution of a population distribution over a discrete state space. A discrete time mean field game (MFG) is motivated as an interpretable model founded on game theory for understanding the aggregate effect of individual actions and predicting the temporal evolution of population distributions. We achieve a synthesis of MFG and Markov decision processes (MDP) by showing that a special MFG is reducible to an MDP. This enables us to broaden the scope of mean field game theory and infer MFG models of large real-world systems via deep inverse reinforcement learning. 
Our method learns both the reward function and forward dynamics of an MFG from real data, and we report the first empirical test of a mean field game model of a real-world social media population.", "keywords": "mean field games;reinforcement learning;Markov decision processes;inverse reinforcement learning;deep learning;inverse optimal control;computational social science;population modeling", "primary_area": "", "supplementary_material": "", "author": "Jiachen Yang;Xiaojing Ye;Rakshit Trivedi;Huan Xu;Hongyuan Zha", "authorids": "yjiachen@gmail.com;xye@gsu.edu;rstrivedi@gatech.edu;huan.xu@isye.gatech.edu;zha@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyang2018deep,\ntitle={Deep Mean Field Games for Learning Optimal Behavior Policy of Large Populations},\nauthor={Jiachen Yang and Xiaojing Ye and Rakshit Trivedi and Huan Xu and Hongyuan Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HktK4BeCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "8;8;10", "confidence": "4;3;5", "rating_avg": 8.666666666666666, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17328831972958604970&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HktK4BeCZ", "pdf": "https://openreview.net/pdf?id=HktK4BeCZ", "email": ";;;;", "author_num": 5 }, { "title": "Polar Transformer Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/263", "id": "HktRlUlAZ", "author_site": "Carlos Esteves, Christine Allen-Blanchette, Xiaowei Zhou, Kostas Daniilidis", "tldr": "We learn feature maps invariant to translation, and equivariant to rotation and scale.", "abstract": "Convolutional neural networks (CNNs) are inherently equivariant to translation. Efforts to embed other forms of equivariance have concentrated solely on rotation. We expand the notion of equivariance in CNNs through the Polar Transformer Network (PTN). PTN combines ideas from the Spatial Transformer Network (STN) and canonical coordinate representations. The result is a network invariant to translation and equivariant to both rotation and scale. PTN is trained end-to-end and composed of three distinct stages: a polar origin predictor, the newly introduced polar transformer module and a classifier. PTN achieves state-of-the-art on rotated MNIST and the newly introduced SIM2MNIST dataset, an MNIST variation obtained by adding clutter and perturbing digits with translation, rotation and scaling. 
The ideas of PTN are extensible to 3D which we demonstrate through the Cylindrical Transformer Network.", "keywords": "equivariance;invariance;canonical coordinates", "primary_area": "", "supplementary_material": "", "author": "Carlos Esteves;Christine Allen-Blanchette;Xiaowei Zhou;Kostas Daniilidis", "authorids": "machc@seas.upenn.edu;allec@seas.upenn.edu;xiaowz@seas.upenn.edu;kostas@seas.upenn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nesteves2018polar,\ntitle={Polar Transformer Networks},\nauthor={Carlos Esteves and Christine Allen-Blanchette and Xiaowei Zhou and Kostas Daniilidis},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HktRlUlAZ},\n}", "github": "[![github](/images/github_icon.svg) daniilidis-group/polar-transformer-networks](https://github.com/daniilidis-group/polar-transformer-networks)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15618354521274654533&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=HktRlUlAZ", "pdf": "https://openreview.net/pdf?id=HktRlUlAZ", "email": ";;;", "author_num": 4 }, { "id": "HktXuGb0-", "title": "Reward Estimation via State Prediction", "track": "main", "status": "Reject", "tldr": "Reward Estimation from Game Videos", "abstract": "Reinforcement learning typically requires carefully designed reward functions in order to learn the desired behavior. We present a novel reward estimation method that is based on a finite sample of optimal state trajectories from expert demonstrations and can be used for guiding an agent to mimic the expert behavior. The optimal state trajectories are used to learn a generative or predictive model of the \u201cgood\u201d states distribution. The reward signal is computed by a function of the difference between the actual next state acquired by the agent and the predicted next state given by the learned generative or predictive model. With this inferred reward function, we perform standard reinforcement learning in the inner loop to guide the agent to learn the given task. Experimental evaluations across a range of tasks demonstrate that the proposed method produces superior performance compared to standard reinforcement learning with both complete or sparse hand engineered rewards. 
Furthermore, we show that our method successfully enables an agent to learn good actions directly from expert player video of games such as the Super Mario Bros and Flappy Bird.", "keywords": "reinforcement learning;inverse reinforcement learning;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Daiki Kimura;Subhajit Chaudhury;Ryuki Tachibana;Sakyasingha Dasgupta", "authorids": "daiki@jp.ibm.com;subhajit@jp.ibm.com;ryuki@jp.ibm.com;sakya@leapmind.io", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkimura2018reward,\ntitle={Reward Estimation via State Prediction},\nauthor={Daiki Kimura and Subhajit Chaudhury and Ryuki Tachibana and Sakyasingha Dasgupta},\nyear={2018},\nurl={https://openreview.net/forum?id=HktXuGb0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HktXuGb0-", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;3", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:U3y3fjY1qVoJ:scholar.google.com/&scioq=Reward+Estimation+via+State+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "All-but-the-Top: Simple and Effective Postprocessing for Word Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/298", "id": "HkuGJ3kCb", "author_site": "Jiaqi Mu, Pramod Viswanath", "tldr": "", "abstract": "Real-valued word representations have transformed NLP applications; popular examples are word2vec and GloVe, recognized for their ability to capture linguistic regularities. In this paper, we demonstrate a {\\em very simple}, and yet counter-intuitive, postprocessing technique -- eliminate the common mean vector and a few top dominating directions from the word vectors -- that renders off-the-shelf representations {\\em even stronger}. The postprocessing is empirically validated on a variety of lexical-level intrinsic tasks (word similarity, concept categorization, word analogy) and sentence-level tasks (semantic textural similarity and text classification) on multiple datasets and with a variety of representation methods and hyperparameter choices in multiple languages; in each case, the processed representations are consistently better than the original ones. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaqi Mu;Pramod Viswanath", "authorids": "jiaqimu2@illinois.edu;pramodv@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmu2018allbutthetop,\ntitle={All-but-the-Top: Simple and Effective Postprocessing for Word Representations},\nauthor={Jiaqi Mu and Pramod Viswanath},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkuGJ3kCb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=HkuGJ3kCb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15952875828745660459&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkuGJ3kCb", "pdf": "https://openreview.net/pdf?id=HkuGJ3kCb", "email": ";", "author_num": 2 }, { "title": "Skip Connections Eliminate Singularities", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/73", "id": "HkwBEMWCZ", "author_site": "Emin Orhan, Xaq Pitkow", "tldr": "Degenerate manifolds arising from the non-identifiability of the model slow down learning in deep networks; skip connections help by breaking degeneracies.", "abstract": "Skip connections made the training of very deep networks possible and have become an indispensable component in a variety of neural architectures. A completely satisfactory explanation for their success remains elusive. Here, we present a novel explanation for the benefits of skip connections in training very deep networks. The difficulty of training deep networks is partly due to the singularities caused by the non-identifiability of the model. Several such singularities have been identified in previous works: (i) overlap singularities caused by the permutation symmetry of nodes in a given layer, (ii) elimination singularities corresponding to the elimination, i.e. consistent deactivation, of nodes, (iii) singularities generated by the linear dependence of the nodes. These singularities cause degenerate manifolds in the loss landscape that slow down learning. We argue that skip connections eliminate these singularities by breaking the permutation symmetry of nodes, by reducing the possibility of node elimination and by making the nodes less linearly dependent. Moreover, for typical initializations, skip connections move the network away from the \"ghosts\" of these singularities and sculpt the landscape around them to alleviate the learning slow-down. 
These hypotheses are supported by evidence from simplified models, as well as from experiments with deep networks trained on real-world datasets.", "keywords": "deep learning;optimization;skip connections", "primary_area": "", "supplementary_material": "", "author": "Emin Orhan;Xaq Pitkow", "authorids": "aeminorhan@gmail.com;xaq@rice.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\norhan2018skip,\ntitle={Skip Connections Eliminate Singularities},\nauthor={Emin Orhan and Xaq Pitkow},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkwBEMWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;8;8", "confidence": "4;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 349, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10390476970198630625&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkwBEMWCZ", "pdf": "https://openreview.net/pdf?id=HkwBEMWCZ", "email": ";", "author_num": 2 }, { "title": "Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/312", "id": "HkwVAXyCW", "author_site": "V\u00edctor Campos, Brendan Jou, Xavier Giro-i-Nieto, Jordi Torres, Shih-Fu Chang", "tldr": "A modification for existing RNN architectures which allows them to skip state updates while preserving the performance of the original architectures.", "abstract": "Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often face challenges like slow inference, vanishing gradients and difficulty in capturing long term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model which extends existing RNN models by learning to skip state updates and shortens the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models. 
Source code is publicly available at https://imatge-upc.github.io/skiprnn-2017-telecombcn/.", "keywords": "recurrent neural networks;dynamic learning;conditional computation", "primary_area": "", "supplementary_material": "", "author": "V\u00edctor Campos;Brendan Jou;Xavier Gir\u00f3-i-Nieto;Jordi Torres;Shih-Fu Chang", "authorids": "victor.campos@bsc.es;bjou@google.com;xavier.giro@upc.edu;jordi.torres@bsc.es;shih.fu.chang@columbia.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ncampos2018skip,\ntitle={Skip {RNN}: Learning to Skip State Updates in Recurrent Neural Networks},\nauthor={V\u00edctor Campos and Brendan Jou and Xavier Gir\u00f3-i-Nieto and Jordi Torres and Shih-Fu Chang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkwVAXyCW},\n}", "github": "[![github](/images/github_icon.svg) imatge-upc/skiprnn-2017-telecombcn](https://github.com/imatge-upc/skiprnn-2017-telecombcn) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HkwVAXyCW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 292, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4452574796134429216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=HkwVAXyCW", "pdf": "https://openreview.net/pdf?id=HkwVAXyCW", "email": ";;;;", "author_num": 5 }, { "title": "Breaking the Softmax Bottleneck: A High-Rank RNN Language Model", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/70", "id": "HkwZSG-CZ", "author_site": "Zhilin Yang, Zihang Dai, Ruslan Salakhutdinov, William W Cohen", "tldr": "", "abstract": "We formulate language modeling as a matrix factorization problem, and show that the expressiveness of Softmax-based models (including the majority of neural language models) is limited by a Softmax bottleneck. Given that natural language is highly context-dependent, this further implies that in practice Softmax with distributed word embeddings does not have enough capacity to model natural language. We propose a simple and effective method to address this issue, and improve the state-of-the-art perplexities on Penn Treebank and WikiText-2 to 47.69 and 40.68 respectively. The proposed method also excels on the large-scale 1B Word dataset, outperforming the baseline by over 5.6 points in perplexity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhilin Yang;Zihang Dai;Ruslan Salakhutdinov;William W. Cohen", "authorids": "zhiliny@cs.cmu.edu;zander.dai@gmail.com;rsalakhu@cs.cmu.edu;wcohen@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nyang2018breaking,\ntitle={Breaking the Softmax Bottleneck: A High-Rank {RNN} Language Model},\nauthor={Zhilin Yang and Zihang Dai and Ruslan Salakhutdinov and William W. 
Cohen},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkwZSG-CZ},\n}", "github": "[![github](/images/github_icon.svg) zihangdai/mos](https://github.com/zihangdai/mos) + [![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=HkwZSG-CZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 23, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 452, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15538946355362697879&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HkwZSG-CZ", "pdf": "https://openreview.net/pdf?id=HkwZSG-CZ", "email": ";;;", "author_num": 4 }, { "id": "HkwrqtlR-", "title": "WHAT ARE GANS USEFUL FOR?", "track": "main", "status": "Reject", "tldr": "", "abstract": "GANs have shown how deep neural networks can be used for generative modeling, aiming at achieving the same impact that they brought for discriminative modeling. The first results were impressive, GANs were shown to be able to generate samples in high dimensional structured spaces, like images and text, that were no copies of the training data. But generative and discriminative learning are quite different. Discriminative learning has a clear end, while generative modeling is an intermediate step to understand the data or generate hypothesis. The quality of implicit density estimation is hard to evaluate, because we cannot tell how well a data is represented by the model. How can we certainly say that a generative process is generating natural images with the same distribution as we do? In this paper, we noticed that even though GANs might not be able to generate samples from the underlying distribution (or we cannot tell at least), they are capturing some structure of the data in that high dimensional space. It is therefore needed to address how we can leverage those estimates produced by GANs in the same way we are able to use other generative modeling algorithms.", "keywords": "Generative Modeling;Generative Adversarial Networks;Density Estimation", "primary_area": "", "supplementary_material": "", "author": "Pablo M. Olmos;Briland Hitaj;Paolo Gasti;Giuseppe Ateniese;Fernando Perez-Cruz", "authorids": "olmos@tsc.uc3m.es;bhitaj@stevens.edu;pgasti@nyit.edu;gatenies@stevens.edu;fernando.perezcruz@sdsc.ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nm.2018what,\ntitle={{WHAT} {ARE} {GANS} {USEFUL} {FOR}?},\nauthor={Pablo M. 
Olmos and Briland Hitaj and Paolo Gasti and Giuseppe Ateniese and Fernando Perez-Cruz},\nyear={2018},\nurl={https://openreview.net/forum?id=HkwrqtlR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkwrqtlR-", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5290386086773534178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Sparse Persistent RNNs: Squeezing Large Recurrent Networks On-Chip", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/200", "id": "HkxF5RgC-", "author_site": "Feiwen Zhu, Jeff Pool, Michael Andersch, Jeremy Appleyard, Fung Xie", "tldr": "Combining network pruning and persistent kernels into a practical, fast, and accurate network implementation.", "abstract": "Recurrent Neural Networks (RNNs) are powerful tools for solving sequence-based problems, but their efficacy and execution time are dependent on the size of the network. Following recent work in simplifying these networks with model pruning and a novel mapping of work onto GPUs, we design an efficient implementation for sparse RNNs. We investigate several optimizations and tradeoffs: Lamport timestamps, wide memory loads, and a bank-aware weight layout. With these optimizations, we achieve speedups of over 6x over the next best algorithm for a hidden layer of size 2304, batch size of 4, and a density of 30%. Further, our technique allows for models of over 5x the size to fit on a GPU for a speedup of 2x, enabling larger networks to help advance the state-of-the-art. 
We perform case studies on NMT and speech recognition tasks in the appendix, accelerating their recurrent layers by up to 3x.", "keywords": "Sparsity;Pruning;Compression;RNN;LSTM;Persistent;RF-Resident;GPU", "primary_area": "", "supplementary_material": "", "author": "Feiwen Zhu;Jeff Pool;Michael Andersch;Jeremy Appleyard;Fung Xie", "authorids": "mzhu@nvidia.com;jpool@nvidia.com;mandersch@nvidia.com;jappleyard@nvidia.com;ftse@nvidia.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhu2018sparse,\ntitle={Sparse Persistent {RNN}s: Squeezing Large Recurrent Networks On-Chip},\nauthor={Feiwen Zhu and Jeff Pool and Michael Andersch and Jeremy Appleyard and Fung Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HkxF5RgC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;2;4", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16979469535338999392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkxF5RgC-", "pdf": "https://openreview.net/pdf?id=HkxF5RgC-", "email": ";;;;", "author_num": 5 }, { "id": "Hy-lXyDWG", "title": "Incremental Learning in Deep Convolutional Neural Networks Using Partial Network Sharing", "track": "main", "status": "Withdraw", "tldr": "The paper is about a new energy-efficient methodology for Incremental learning", "abstract": "Deep convolutional neural network (DCNN) based supervised learning is a widely practiced approach for large-scale image classification. However, retraining these large networks to accommodate new, previously unseen data demands high computational time and energy requirements. Also, previously seen training samples may not be available at the time of retraining. We propose an efficient training methodology and incrementally growing a DCNN to allow new classes to be learned while sharing part of the base network. Our proposed methodology is inspired by transfer learning techniques, although it does not forget previously learned classes. An updated network for learning new set of classes is formed using previously learned convolutional layers (shared from initial part of base network) with addition of few newly added convolutional kernels included in the later layers of the network. We evaluated the proposed scheme on several recognition applications. 
The classification accuracy achieved by our approach is comparable to the regular incremental learning approach (where networks are updated with new training samples only, without any network sharing).", "keywords": "Deep learning;Incremental learning;energy-efficient learning;supervised learning", "primary_area": "", "supplementary_material": "", "author": "Syed Shakib Sarwar;Aayush Ankit;Kaushik Roy", "authorids": "sarwar@purdue.edu;aankit@purdue.edu;kaushik@purdue.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hy-lXyDWG", "pdf_size": 0, "rating": "2;4;4", "confidence": "5;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4624498856277808350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "Hy1d-ebAb", "title": "Learning Deep Generative Models of Graphs", "track": "main", "status": "Workshop", "tldr": "We study the graph generation problem and propose a powerful deep generative model capable of generating arbitrary graphs.", "abstract": "Graphs are fundamental data structures required to model many important real-world data, from knowledge graphs, physical and social interactions to molecules and proteins. In this paper, we study the problem of learning generative models of graphs from a dataset of graphs of interest. After learning, these models can be used to generate samples with similar properties as the ones in the dataset. Such models can be useful in a lot of applications, e.g. drug discovery and knowledge graph construction. The task of learning generative models of graphs, however, has its unique challenges. In particular, how to handle symmetries in graphs and ordering of its elements during the generation process are important issues. We propose a generic graph neural net based model that is capable of generating any arbitrary graph. We study its performance on a few graph generation tasks compared to baselines that exploit domain knowledge. 
We discuss potential issues and open problems for such generative models going forward.", "keywords": "Generative Model of Graphs", "primary_area": "", "supplementary_material": "", "author": "Yujia Li;Oriol Vinyals;Chris Dyer;Razvan Pascanu;Peter Battaglia", "authorids": "yujiali@google.com;vinyals@google.com;cdyer@google.com;razp@google.com;peterbattaglia@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2018learning,\ntitle={Learning Deep Generative Models of Graphs},\nauthor={Yujia Li and Oriol Vinyals and Chris Dyer and Razvan Pascanu and Peter Battaglia},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy1d-ebAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=Hy1d-ebAb", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 793, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17513730911496359317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "Hy3MvSlRW", "title": "Adversarial reading networks for machine comprehension", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine reading has recently shown remarkable progress thanks to differentiable\nreasoning models. In this context, End-to-End trainable Memory Networks\n(MemN2N) have demonstrated promising performance on simple natural language\nbased reasoning tasks such as factual reasoning and basic deduction. However,\nthe task of machine comprehension is currently bounded to a supervised setting\nand available question answering dataset. In this paper we explore the paradigm\nof adversarial learning and self-play for the task of machine reading comprehension.\nInspired by the successful propositions in the domain of game learning, we\npresent a novel approach of training for this task that is based on the definition\nof a coupled attention-based memory model. On one hand, a reader network is\nin charge of finding answers regarding a passage of text and a question. On the\nother hand, a narrator network is in charge of obfuscating spans of text in order\nto minimize the probability of success of the reader. We experimented the model\non several question-answering corpora. 
The proposed learning paradigm and associated\nmodels present encouraging results.", "keywords": "machine reading;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Quentin Grail;Julien Perez", "authorids": "julien.perez@naverlabs.com;julien.perez@naverlabs.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngrail2018adversarial,\ntitle={Adversarial reading networks for machine comprehension},\nauthor={Quentin Grail and Julien Perez},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy3MvSlRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hy3MvSlRW", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hg74Zx2OqV4J:scholar.google.com/&scioq=Adversarial+reading+networks+for+machine+comprehension&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "A Neural Representation of Sketch Drawings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/293", "id": "Hy6GHpkCW", "author_site": "David Ha, Douglas Eck", "tldr": "We investigate alternative to traditional pixel image modelling approaches, and propose a generative model for vector images.", "abstract": "We present sketch-rnn, a recurrent neural network able to construct stroke-based drawings of common objects. The model is trained on a dataset of human-drawn images representing many different classes. 
We outline a framework for conditional and unconditional sketch generation, and describe new robust training methods for generating coherent sketch drawings in a vector format.", "keywords": "applications;image modelling;computer-assisted;drawing;art;creativity;dataset", "primary_area": "", "supplementary_material": "", "author": "David Ha;Douglas Eck", "authorids": "hadavid@google.com;deck@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nha2018a,\ntitle={A Neural Representation of Sketch Drawings},\nauthor={David Ha and Douglas Eck},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy6GHpkCW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 19 community implementations](https://paperswithcode.com/paper/?openreview=Hy6GHpkCW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;8;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2337146750300919321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Hy6GHpkCW", "pdf": "https://openreview.net/pdf?id=Hy6GHpkCW", "email": ";", "author_num": 2 }, { "id": "Hy7EPh10W", "title": "Novelty Detection with GAN", "track": "main", "status": "Reject", "tldr": "We propose to solve a problem of simultaneous classification and novelty detection within the GAN framework.", "abstract": "The ability of a classifier to recognize unknown inputs is important for many classification-based systems. We discuss the problem of simultaneous classification and novelty detection, i.e. determining whether an input is from the known set of classes and from which specific class, or from an unknown domain and does not belong to any of the known classes. We propose a method based on the Generative Adversarial Networks (GAN) framework. We show that a multi-class discriminator trained with a generator that generates samples from a mixture of nominal and novel data distributions is the optimal novelty detector. We approximate that generator with a mixture generator trained with the Feature Matching loss and empirically show that the proposed method outperforms conventional methods for novelty detection. 
Our findings demonstrate a simple, yet powerful new application of the GAN framework for the task of novelty detection.", "keywords": "novelty detection;GAN;feature matching;semi-supervised", "primary_area": "", "supplementary_material": "", "author": "Mark Kliger;Shachar Fleishman", "authorids": "mark.kliger@gmail.com;shacharfl@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkliger2018novelty,\ntitle={Novelty Detection with {GAN}},\nauthor={Mark Kliger and Shachar Fleishman},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy7EPh10W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy7EPh10W", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12252395220090301474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "AmbientGAN: Generative models from lossy measurements", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/231", "id": "Hy7fDog0b", "author_site": "Ashish Bora, Eric Price, Alexandros Dimakis", "tldr": "How to learn GANs from noisy, distorted, partial observations", "abstract": "Generative models provide a way to model structure in complex distributions and have been shown to be useful for many tasks of practical interest. However, current techniques for training generative models require access to fully-observed samples. In many settings, it is expensive or even impossible to obtain fully-observed samples, but economical to obtain partial, noisy observations. We consider the task of learning an implicit generative model given only lossy measurements of samples from the distribution of interest. We show that the true underlying distribution can be provably recovered even in the presence of per-sample information loss for a class of measurement models. Based on this, we propose a new method of training Generative Adversarial Networks (GANs) which we call AmbientGAN. On three benchmark datasets, and for various measurement models, we demonstrate substantial qualitative and quantitative improvements. Generative models trained with our method can obtain $2$-$4$x higher inception scores than the baselines.", "keywords": "Generative models;Adversarial networks;Lossy measurements", "primary_area": "", "supplementary_material": "", "author": "Ashish Bora;Eric Price;Alexandros G. Dimakis", "authorids": "ashish.bora@utexas.edu;ecprice@cs.utexas.edu;dimakis@austin.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbora2018ambientgan,\ntitle={Ambient{GAN}: Generative models from lossy measurements},\nauthor={Ashish Bora and Eric Price and Alexandros G. 
Dimakis},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy7fDog0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 273, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16888678630950428815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Hy7fDog0b", "pdf": "https://openreview.net/pdf?id=Hy7fDog0b", "email": ";;", "author_num": 3 }, { "id": "Hy8hkYeRb", "title": "A Deep Predictive Coding Network for Learning Latent Representations", "track": "main", "status": "Reject", "tldr": "A predictive coding based learning algorithm for building deep neural network models of the brain", "abstract": "It has been argued that the brain is a prediction machine that continuously learns how to make better predictions about the stimuli received from the external environment. For this purpose, it builds a model of the world around us and uses this model to infer the external stimulus. Predictive coding has been proposed as a mechanism through which the brain might be able to build such a model of the external environment. However, it is not clear how predictive coding can be used to build deep neural network models of the brain while complying with the architectural constraints imposed by the brain. In this paper, we describe an algorithm to build a deep generative model using predictive coding that can be used to infer latent representations about the stimuli received from external environment. Specifically, we used predictive coding to train a deep neural network on real-world images in a unsupervised learning paradigm. To understand the capacity of the network with regards to modeling the external environment, we studied the latent representations generated by the model on images of objects that are never presented to the model during training. Despite the novel features of these objects the model is able to infer the latent representations for them. 
Furthermore, the reconstructions of the original images obtained from these latent representations preserve the important details of these objects.", "keywords": "Predictive coding;deep neural network;generative model;unsupervised learning;learning latent representations", "primary_area": "", "supplementary_material": "", "author": "Shirin Dora;Cyriel Pennartz;Sander Bohte", "authorids": "shirin.dora@gmail.com;c.m.a.pennartz@uva.nl;s.m.bohte@cwi.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndora2018a,\ntitle={A Deep Predictive Coding Network for Learning Latent Representations},\nauthor={Shirin Dora and Cyriel Pennartz and Sander Bohte},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy8hkYeRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy8hkYeRb", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5493519506250947909&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HyB9Np6WG", "title": "Tensor-Based Preposition Representation", "track": "main", "status": "Withdraw", "tldr": "This work is about tensor-based method for preposition representation training.", "abstract": "Prepositions are among the most frequent words. Good prepositional representation is of great syntactic and semantic interest in computational linguistics. Existing methods on preposition representation either treat prepositions as content words (e.g., word2vec and GloVe) or depend heavily on external linguistic resources including syntactic parsing, training task and dataset-specific representations. In this paper we use word-triple counts (one of the words is a preposition) to capture the preposition's interaction with its head and children. Prepositional embeddings are derived via tensor decompositions on a large unlabeled corpus. We reveal a new geometry involving Hadamard products and empirically demonstrate its utility in paraphrasing of phrasal verbs. Furthermore, our prepositional embeddings are used as simple features to two challenging downstream tasks: preposition selection and prepositional attachment disambiguation. We achieve comparable to or better results than state of the art on multiple standardized datasets. 
", "keywords": "word representation;unsupervised learning;computational linguistics", "primary_area": "", "supplementary_material": "", "author": "Hongyu Gong;Suma Bhat;Pramod Viswanath", "authorids": "hgong6@illinois.edu;pramodv@illinois.edu;spbhat2@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyB9Np6WG", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8310973120995981917&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "HyBbjW-RW", "title": "Open Loop Hyperparameter Optimization and Determinantal Point Processes", "track": "main", "status": "Reject", "tldr": "Driven by the need for parallelizable, open-loop hyperparameter optimization methods, we propose the use of $k$-determinantal point processes in hyperparameter optimization via random search.", "abstract": "Driven by the need for parallelizable hyperparameter optimization methods, this paper studies \\emph{open loop} search methods: sequences that are predetermined and can be generated before a single configuration is evaluated. Examples include grid search, uniform random search, low discrepancy sequences, and other sampling distributions.\nIn particular, we propose the use of $k$-determinantal point processes in hyperparameter optimization via random search. Compared to conventional uniform random search where hyperparameter settings are sampled independently, a $k$-DPP promotes diversity. We describe an approach that transforms hyperparameter search spaces for efficient use with a $k$-DPP. In addition, we introduce a novel Metropolis-Hastings algorithm which can sample from $k$-DPPs defined over spaces with a mixture of discrete and continuous dimensions. Our experiments show significant benefits over uniform random search in realistic scenarios with a limited budget for training supervised learners, whether in serial or parallel.", "keywords": "hyperparameter optimization;random search;determinantal point processes;low discrepancy sequences", "primary_area": "", "supplementary_material": "", "author": "Jesse Dodge;Kevin Jamieson;Noah A. Smith", "authorids": "jessed@cs.cmu.edu;jamieson@cs.washington.edu;nasmith@cs.washington.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndodge2018open,\ntitle={Open Loop Hyperparameter Optimization and Determinantal Point Processes},\nauthor={Jesse Dodge and Kevin Jamieson and Noah A. 
Smith},\nyear={2018},\nurl={https://openreview.net/forum?id=HyBbjW-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyBbjW-RW", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;5;5", "rating_avg": 4.0, "confidence_avg": 5.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9326564281667380212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HyDAQl-AW", "title": "Time Limits in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We consider the problem of learning optimal policies in time-limited and time-unlimited domains using time-limited interactions.", "abstract": "In reinforcement learning, it is common to let an agent interact with its environment for a fixed amount of time before resetting the environment and repeating the process in a series of episodes. The task that the agent has to learn can either be to maximize its performance over (i) that fixed amount of time, or (ii) an indefinite period where the time limit is only used during training. In this paper, we investigate theoretically how time limits could effectively be handled in each of the two cases. In the first one, we argue that the terminations due to time limits are in fact part of the environment, and propose to include a notion of the remaining time as part of the agent's input. In the second case, the time limits are not part of the environment and are only used to facilitate learning. We argue that such terminations should not be treated as environmental ones and propose a method, specific to value-based algorithms, that incorporates this insight by continuing to bootstrap at the end of each partial episode. To illustrate the significance of our proposals, we perform several experiments on a range of environments from simple few-state transition graphs to complex control tasks, including novel and standard benchmark domains. 
Our results show that the proposed methods improve the performance and stability of existing reinforcement learning algorithms.", "keywords": "reinforcement learning;Markov decision processes;deep learning", "primary_area": "", "supplementary_material": "", "author": "Fabio Pardo;Arash Tavakoli;Vitaly Levdik;Petar Kormushev", "authorids": "f.pardo@imperial.ac.uk;a.tavakoli@imperial.ac.uk;v.levdik@imperial.ac.uk;p.kormushev@imperial.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npardo2018time,\ntitle={Time Limits in Reinforcement Learning},\nauthor={Fabio Pardo and Arash Tavakoli and Vitaly Levdik and Petar Kormushev},\nyear={2018},\nurl={https://openreview.net/forum?id=HyDAQl-AW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=HyDAQl-AW", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 211, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1049564009410089349&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "HyDMX0l0Z", "title": "Towards Effective GANs for Data Distributions with Diverse Modes", "track": "main", "status": "Workshop", "tldr": "We introduce theory to explain the failure of GANs on complex datasets and propose a solution to fix it.", "abstract": "Generative Adversarial Networks (GANs), when trained on large datasets with diverse modes, are known to produce conflated images which do not distinctly belong to any of the modes. We hypothesize that this problem occurs due to the interaction between two facts: (1) For datasets with large variety, it is likely that the modes lie on separate manifolds. (2) The generator (G) is formulated as a continuous function, and the input noise is derived from a connected set, due to which G's output is a connected set. If G covers all modes, then there must be some portion of G's output which connects them. This corresponds to undesirable, conflated images. We develop theoretical arguments to support these intuitions. We propose a novel method to break the second assumption via learnable discontinuities in the latent noise space. Equivalently, it can be viewed as training several generators, thus creating discontinuities in the G function. We also augment the GAN formulation with a classifier C that predicts which noise partition/generator produced the output images, encouraging diversity between each partition/generator. 
We experiment on MNIST, celebA, STL-10, and a difficult dataset with clearly distinct modes, and show that the noise partitions correspond to different modes of the data distribution, and produce images of superior quality.", "keywords": "generative adversarial networks;GANs;deep learning;unsupervised learning;generative models;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Sanchit Agrawal;Gurneet Singh;Mitesh Khapra", "authorids": "sanchit@cse.iitm.ac.in;garry@cse.iitm.ac.in;miteshk@cse.iitm.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nagrawal2018towards,\ntitle={Towards Effective {GAN}s for Data Distributions with Diverse Modes},\nauthor={Sanchit Agrawal and Gurneet Singh and Mitesh Khapra},\nyear={2018},\nurl={https://openreview.net/forum?id=HyDMX0l0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyDMX0l0Z", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kUvHnXT9fQwJ:scholar.google.com/&scioq=Towards+Effective+GANs+for+Data+Distributions+with+Diverse+Modes&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HyEi7bWR-", "title": "Orthogonal Recurrent Neural Networks with Scaled Cayley Transform", "track": "main", "status": "Reject", "tldr": "A novel approach to maintain orthogonal recurrent weight matrices in a RNN.", "abstract": "Recurrent Neural Networks (RNNs) are designed to handle sequential data but suffer from vanishing or exploding gradients. Recent work on Unitary Recurrent Neural Networks (uRNNs) have been used to address this issue and in some cases, exceed the capabilities of Long Short-Term Memory networks (LSTMs). We propose a simpler and novel update scheme to maintain orthogonal recurrent weight matrices without using complex valued matrices. This is done by parametrizing with a skew-symmetric matrix using the Cayley transform. Such a parametrization is unable to represent matrices with negative one eigenvalues, but this limitation is overcome by scaling the recurrent weight matrix by a diagonal matrix consisting of ones and negative ones. The proposed training scheme involves a straightforward gradient calculation and update step. 
In several experiments, the proposed scaled Cayley orthogonal recurrent neural network (scoRNN) achieves superior results with fewer trainable parameters than other unitary RNNs.", "keywords": "recurrent neural networks;vanishing gradients;exploding gradients;orthogonal;unitary;long term dependencies;uRNN", "primary_area": "", "supplementary_material": "", "author": "Kyle Helfrich;Devin Willmott;Qiang Ye", "authorids": "kyle.helfrich@uky.edu;devin.willmott@uky.edu;qiang.ye@uky.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhelfrich2018orthogonal,\ntitle={Orthogonal Recurrent Neural Networks with Scaled Cayley Transform},\nauthor={Kyle Helfrich and Devin Willmott and Qiang Ye},\nyear={2018},\nurl={https://openreview.net/forum?id=HyEi7bWR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyEi7bWR-", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10576322947857760953&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HyFaiGbCW", "title": "Generalization of Learning using Reservoir Computing", "track": "main", "status": "Reject", "tldr": "Generalization of the relationships learnt between pairs of images using a small training data to previously unseen types of images using an explainable dynamical systems model, Reservoir Computing, and a biologically plausible learning technique based on analogies.", "abstract": "We investigate the methods by which a Reservoir Computing Network (RCN) learns concepts such as 'similar' and 'different' between pairs of images using a small training dataset and generalizes these concepts to previously unseen types of data. Specifically, we show that an RCN trained to identify relationships between image-pairs drawn from a subset of digits from the MNIST database or the depth maps of subset of visual scenes from a moving camera generalizes the learned transformations to images of digits unseen during training or depth maps of different visual scenes. We infer, using Principal Component Analysis, that the high dimensional reservoir states generated from an input image pair with a specific transformation converge over time to a unique relationship. Thus, as opposed to training the entire high dimensional reservoir state, the reservoir only needs to train on these unique relationships, allowing the reservoir to perform well with very few training examples. Thus, generalization of learning to unseen images is interpretable in terms of clustering of the reservoir state onto the attractor corresponding to the transformation in reservoir space. We find that RCNs can identify and generalize linear and non-linear transformations, and combinations of transformations, naturally and be a robust and effective image classifier. Additionally, RCNs perform significantly better than state of the art neural network classification techniques such as deep Siamese Neural Networks (SNNs) in generalization tasks both on the MNIST dataset and more complex depth maps of visual scenes from a moving camera. 
This work helps bridge the gap between explainable machine learning and biological learning through analogies using small datasets, and points to new directions in the investigation of learning processes.", "keywords": "Generalization;Reservoir Computing;dynamical system;Siamese Neural Network;image classification;similarity;dimensionality reduction", "primary_area": "", "supplementary_material": "", "author": "Sanjukta Krishnagopal;Yiannis Aloimonos;Michelle Girvan", "authorids": "sanjukta@umd.edu;yiannis@cs.umd.edu;girvan@umd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkrishnagopal2018generalization,\ntitle={Generalization of Learning using Reservoir Computing},\nauthor={Sanjukta Krishnagopal and Yiannis Aloimonos and Michelle Girvan},\nyear={2018},\nurl={https://openreview.net/forum?id=HyFaiGbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyFaiGbCW", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;5", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11157306896563211945&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Variational Message Passing with Structured Inference Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/136", "id": "HyH9lbZAW", "author_site": "Wu Lin, Nicolas Daniel Hubacher, Mohammad Emtiyaz Khan", "tldr": "We propose a variational message-passing algorithm for models that contain both the deep model and probabilistic graphical model.", "abstract": "Recent efforts on combining deep models with probabilistic graphical models are promising in providing flexible models that are also easy to interpret. We propose a variational message-passing algorithm for variational inference in such models. We make three contributions. First, we propose structured inference networks that incorporate the structure of the graphical model in the inference network of variational auto-encoders (VAE). Second, we establish conditions under which such inference networks enable fast amortized inference similar to VAE. Finally, we derive a variational message passing algorithm to perform efficient natural-gradient inference while retaining the efficiency of the amortized inference. 
By simultaneously enabling structured, amortized, and natural-gradient inference for deep structured models, our method simplifies and generalizes existing methods.", "keywords": "Variational Inference;Variational Message Passing;Variational Auto-Encoder;Graphical Models;Structured Models;Natural Gradients", "primary_area": "", "supplementary_material": "", "author": "Wu Lin;Nicolas Hubacher;Mohammad Emtiyaz Khan", "authorids": "wlin2018@cs.ubc.ca;nicolas.hubacher@outlook.com;emtiyaz@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlin2018variational,\ntitle={Variational Message Passing with Structured Inference Networks},\nauthor={Wu Lin and Mohammad Emtiyaz Khan and Nicolas Hubacher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyH9lbZAW},\n}", "github": "[![github](/images/github_icon.svg) emtiyaz/vmp-for-svae](https://github.com/emtiyaz/vmp-for-svae)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;2", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4788714492758509312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyH9lbZAW", "pdf": "https://openreview.net/pdf?id=HyH9lbZAW", "email": ";;", "author_num": 3 }, { "id": "HyHmGyZCZ", "title": "Comparison of Paragram and GloVe Results for Similarity Benchmarks", "track": "main", "status": "Reject", "tldr": "Paper provides a description of a procedure to enhance word vector space model with an evaluation of Paragram and GloVe models for Similarity Benchmarks.", "abstract": "Distributional Semantics Models(DSM) derive word space from linguistic items\nin context. Meaning is obtained by defining a distance measure between vectors\ncorresponding to lexical entities. Such vectors present several problems. This\nwork concentrates on quality of word embeddings, improvement of word embedding\nvectors, applicability of a novel similarity metric used \u2018on top\u2019 of the\nword embeddings. In this paper we provide comparison between two methods\nfor post process improvements to the baseline DSM vectors. The counter-fitting\nmethod which enforces antonymy and synonymy constraints into the Paragram\nvector space representations recently showed improvement in the vectors\u2019 capability\nfor judging semantic similarity. The second method is our novel RESM\nmethod applied to GloVe baseline vectors. By applying the hubness reduction\nmethod, implementing relational knowledge into the model by retrofitting synonyms\nand providing a new ranking similarity definition RESM that gives maximum\nweight to the top vector component values we equal the results for the ESL\nand TOEFL sets in comparison with our calculations using the Paragram and Paragram\n+ Counter-fitting methods. For SIMLEX-999 gold standard since we cannot\nuse the RESM the results using GloVe and PPDB are significantly worse compared\nto Paragram. Apparently, counter-fitting corrects hubness. The Paragram\nor our cosine retrofitting method are state-of-the-art results for the SIMLEX-999\ngold standard. 
They are 0.2 better for SIMLEX-999 than word2vec with sense\nde-conflation (that was announced to be state-of the-art method for less reliable\ngold standards). Apparently relational knowledge and counter-fitting is more important\nfor judging semantic similarity than sense determination for words. It is to\nbe mentioned, though that Paragram hyperparameters are fitted to SIMLEX-999\nresults. The lesson is that many corrections to word embeddings are necessary\nand methods with more parameters and hyperparameters perform better.\n", "keywords": "language models;vector spaces;word embedding;similarity", "primary_area": "", "supplementary_material": "", "author": "Jakub Dutkiewicz;Czes\u0142aw J\u0119drzejek", "authorids": "jakub.dutkiewicz@put.poznan.pl;czeslaw.jedrzejek@put.poznan.pl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndutkiewicz2018comparison,\ntitle={Comparison of Paragram and GloVe Results for Similarity Benchmarks},\nauthor={Jakub Dutkiewicz and Czes\u0142aw J\u0119drzejek},\nyear={2018},\nurl={https://openreview.net/forum?id=HyHmGyZCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyHmGyZCZ", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gOsN9ia0QpkJ:scholar.google.com/&scioq=Comparison+of+Paragram+and+GloVe+Results+for+Similarity+Benchmarks&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "id": "HyI5ro0pW", "title": "Neural Networks with Block Diagonal Inner Product Layers", "track": "main", "status": "Reject", "tldr": "We look at neural networks with block diagonal inner product layers for efficiency.", "abstract": "Artificial neural networks have opened up a world of possibilities in data science and artificial intelligence, but neural networks are cumbersome tools that grow with the complexity of the learning problem. We make contributions to this issue by considering a modified version of the fully connected layer we call a block diagonal inner product layer. These modified layers have weight matrices that are block diagonal, turning a single fully connected layer into a set of densely connected neuron groups. This idea is a natural extension of group, or depthwise separable, convolutional layers applied to the fully connected layers. Block diagonal inner product layers can be achieved by either initializing a purely block diagonal weight matrix or by iteratively pruning off diagonal block entries. 
This method condenses network storage and speeds up the run time without significant adverse effect on the testing accuracy, thus offering a new approach to improve network computation efficiency.", "keywords": "Deep Learning;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Amy Nesky;Quentin Stout", "authorids": "anesky@umich.edu;qstout@umich.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnesky2018neural,\ntitle={Neural Networks with Block Diagonal Inner Product Layers},\nauthor={Amy Nesky and Quentin Stout},\nyear={2018},\nurl={https://openreview.net/forum?id=HyI5ro0pW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyI5ro0pW", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3850997309955413811&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HyI6s40a-", "title": "Towards Safe Deep Learning: Unsupervised Defense Against Generic Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "Devising unsupervised defense mechanisms against adversarial attacks is crucial to ensure the generalizability of the defense. ", "abstract": "Recent advances in adversarial Deep Learning (DL) have opened up a new and largely unexplored surface for malicious attacks jeopardizing the integrity of autonomous DL systems. We introduce a novel automated countermeasure called Parallel Checkpointing Learners (PCL) to thwart the potential adversarial attacks and significantly improve the reliability (safety) of a victim DL model. The proposed PCL methodology is unsupervised, meaning that no adversarial sample is leveraged to build/train parallel checkpointing learners. We formalize the goal of preventing adversarial attacks as an optimization problem to minimize the rarely observed regions in the latent feature space spanned by a DL network. To solve the aforementioned minimization problem, a set of complementary but disjoint checkpointing modules are trained and leveraged to validate the victim model execution in parallel. Each checkpointing learner explicitly characterizes the geometry of the input data and the corresponding high-level data abstractions within a particular DL layer. As such, the adversary is required to simultaneously deceive all the defender modules in order to succeed. We extensively evaluate the performance of the PCL methodology against the state-of-the-art attack scenarios, including Fast-Gradient-Sign (FGS), Jacobian Saliency Map Attack (JSMA), Deepfool, and Carlini&WagnerL2 algorithm. Extensive proof-of-concept evaluations for analyzing various data collections including MNIST, CIFAR10, and ImageNet corroborate the effectiveness of our proposed defense mechanism against adversarial samples. 
", "keywords": "Adversarial Attacks;Unsupervised Defense;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Bita Darvish Rouhani;Mohammad Samragh;Tara Javidi;Farinaz Koushanfar", "authorids": "bita@ucsd.edu;msamragh@ucsd.edu;tjavidi@ucsd.edu;farinaz@ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndarvish2018towards,\ntitle={Towards Safe Deep Learning: Unsupervised Defense Against Generic Adversarial Attacks},\nauthor={Bita Darvish Rouhani and Mohammad Samragh and Tara Javidi and Farinaz Koushanfar},\nyear={2018},\nurl={https://openreview.net/forum?id=HyI6s40a-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyI6s40a-", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;3;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16912933901147643912&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyIFzx-0b", "title": "BinaryFlex: On-the-Fly Kernel Generation in Binary Convolutional Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work we present BinaryFlex, a neural network architecture that learns weighting coefficients of predefined orthogonal binary basis instead of the conventional approach of learning directly the convolutional filters. We have demonstrated the feasibility of our approach for complex computer vision datasets such as ImageNet. Our architecture trained on ImageNet is able to achieve top-5 accuracy of 65.7% while being around 2x smaller than binary networks capable of achieving similar accuracy levels. By using deterministic basis, that can be generated on-the-fly very efficiently, our architecture offers a great deal of flexibility in memory footprint when deploying in constrained microcontroller devices.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent W.-S. Tseng;Sourav Bhattachary;Javier Fern\u00e1ndez Marqu\u00e9s;Milad Alizadeh;Catherine Tong;Nicholas Donald Lane", "authorids": "wt262@cornell.edu;sourav.bhattacharya@nokia-bell-labs.com;javier.fernandezmarques@cs.ox.ac.uk;milad.alizadeh@cs.ox.ac.uk;eu.tong@cs.ox.ac.uk;nicholas.lane@cs.ox.uk", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nw.-s.2018binaryflex,\ntitle={BinaryFlex: On-the-Fly Kernel Generation in Binary Convolutional Networks},\nauthor={Vincent W.-S. 
Tseng and Sourav Bhattachary and Javier Fern\u00e1ndez Marqu\u00e9s and Milad Alizadeh and Catherine Tong and Nicholas Donald Lane},\nyear={2018},\nurl={https://openreview.net/forum?id=HyIFzx-0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyIFzx-0b", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_w70WEDGodsJ:scholar.google.com/&scioq=BinaryFlex:+On-the-Fly+Kernel+Generation+in+Binary+Convolutional+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyKZyYlRZ", "title": "Large Scale Multi-Domain Multi-Task Learning with MultiModel", "track": "main", "status": "Reject", "tldr": "Large scale multi-task architecture solves ImageNet and translation together and shows transfer learning.", "abstract": "Deep learning yields great results across many fields,\nfrom speech recognition, image classification, to translation.\nBut for each problem, getting a deep model to work well involves\nresearch into the architecture and a long period of tuning.\n\nWe present a single model that yields good results on a number\nof problems spanning multiple domains. In particular, this single model\nis trained concurrently on ImageNet, multiple translation tasks,\nimage captioning (COCO dataset), a speech recognition corpus,\nand an English parsing task. \n\nOur model architecture incorporates building blocks from multiple\ndomains. It contains convolutional layers, an attention mechanism,\nand sparsely-gated layers.\n\nEach of these computational blocks is crucial for a subset of\nthe tasks we train on. Interestingly, even if a block is not\ncrucial for a task, we observe that adding it never hurts performance\nand in most cases improves it on all tasks.\n\nWe also show that tasks with less data benefit largely from joint\ntraining with other tasks, while performance on large tasks degrades\nonly slightly if at all.", "keywords": "multi-task learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Lukasz Kaiser;Aidan N. Gomez;Noam Shazeer;Ashish Vaswani;Niki Parmar;Llion Jones;Jakob Uszkoreit", "authorids": "lukaszkaiser@google.com;aidan.n.gomez@gmail.com;noam@google.com;avaswani@google.com;nikip@google.com;llion@google.com;usz@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nkaiser2018large,\ntitle={Large Scale Multi-Domain Multi-Task Learning with MultiModel},\nauthor={Lukasz Kaiser and Aidan N. 
Gomez and Noam Shazeer and Ashish Vaswani and Niki Parmar and Llion Jones and Jakob Uszkoreit},\nyear={2018},\nurl={https://openreview.net/forum?id=HyKZyYlRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyKZyYlRZ", "pdf_size": 0, "rating": "3;6;6", "confidence": "5;3;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uh7Pky3UtncJ:scholar.google.com/&scioq=Large+Scale+Multi-Domain+Multi-Task+Learning+with+MultiModel&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Kronecker-factored Curvature Approximations for Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/12", "id": "HyMTkQZAb", "author_site": "James Martens, Jimmy Ba, Matthew Johnson", "tldr": "We extend the K-FAC method to RNNs by developing a new family of Fisher approximations.", "abstract": "Kronecker-factor Approximate Curvature (Martens & Grosse, 2015) (K-FAC) is a 2nd-order optimization method which has been shown to give state-of-the-art performance on large-scale neural network optimization tasks (Ba et al., 2017). It is based on an approximation to the Fisher information matrix (FIM) that makes assumptions about the particular structure of the network and the way it is parameterized. The original K-FAC method was applicable only to fully-connected networks, although it has been recently extended by Grosse & Martens (2016) to handle convolutional networks as well. In this work we extend the method to handle RNNs by introducing a novel approximation to the FIM for RNNs. This approximation works by modelling the covariance structure between the gradient contributions at different time-steps using a chain-structured linear Gaussian graphical model, summing the various cross-covariances, and computing the inverse in closed form. 
We demonstrate in experiments that our method significantly outperforms general purpose state-of-the-art optimizers like SGD with momentum and Adam on several challenging RNN training tasks.", "keywords": "optimization;K-FAC;natural gradient;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "James Martens;Jimmy Ba;Matt Johnson", "authorids": "james.martens@gmail.com;jimmy@psi.toronto.edu;mattjj@csail.mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmartens2018kroneckerfactored,\ntitle={Kronecker-factored Curvature Approximations for Recurrent Neural Networks},\nauthor={James Martens and Jimmy Ba and Matt Johnson},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyMTkQZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10559967473707434655&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyMTkQZAb", "pdf": "https://openreview.net/pdf?id=HyMTkQZAb", "email": ";;", "author_num": 3 }, { "id": "HyN-ZvlC-", "title": "Large Margin Neural Language Models", "track": "main", "status": "Reject", "tldr": "Enhance the language model for supervised learning task ", "abstract": "Neural language models (NLMs) are generative, and they model the distribution of grammatical sentences. Trained on huge corpus, NLMs are pushing the limit of modeling accuracy. Besides, they have also been applied to supervised learning tasks that decode text, e.g., automatic speech recognition (ASR). By re-scoring the n-best list, NLM can select grammatically more correct candidate among the list, and significantly reduce word/char error rate. However, the generative nature of NLM may not guarantee a discrimination between \u201cgood\u201d and \u201cbad\u201d (in a task-specific sense) sentences, resulting in suboptimal performance. This work proposes an approach to adapt a generative NLM to a discriminative one. Different from the commonly used maximum likelihood objective, the proposed method aims at enlarging the margin between the \u201cgood\u201d and \u201cbad\u201d sentences. It is trained end-to-end and can be widely applied to tasks that involve the re-scoring of the decoded text. 
Significant gains are observed in both ASR and statistical machine translation (SMT) tasks.", "keywords": "Language Model;discriminative model", "primary_area": "", "supplementary_material": "", "author": "Jiaji Huang;Yi Li;Wei Ping;Sanjeev Satheesh;Gregory Diamos", "authorids": "huangjiaji@baidu.com;liyi17@baidu.com;pingwei01@baidu.com;sanjeevsatheesh@baidu.com;gregdiamos@baidu.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhuang2018large,\ntitle={Large Margin Neural Language Models},\nauthor={Jiaji Huang and Yi Li and Wei Ping and Sanjeev Satheesh and Gregory Diamos},\nyear={2018},\nurl={https://openreview.net/forum?id=HyN-ZvlC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyN-ZvlC-", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12607896468167082164&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HyNyyCxA-", "title": "Confidence Scoring Using Whitebox Meta-models with Linear Classifier Probes", "track": "main", "status": "Active", "tldr": "", "abstract": "We propose a confidence scoring mechanism for multi-layer neural networks based on a paradigm of a base model and a meta-model. The confidence score is learned by the meta-model using features derived from the base model \u2013 a deep neural network considered a whitebox. As features, we investigate linear classifier probes inserted between the various layers of the base model and trained using each layer\u2019s intermediate activations. Experiments show that this approach outperforms various baselines in a filtering task, i.e., task of rejecting samples with low confidence. 
Experimental results are presented using CIFAR-10 and CIFAR-100 dataset with and without added noise exploring various aspects of the method.", "keywords": "confidence scoring;meta-model;linear classifier probes", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper437/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{ \nanonymous2018confidence, \ntitle={Confidence Scoring Using Whitebox Meta-models with Linear Classifier Probes}, \nauthor={Anonymous}, \njournal={International Conference on Learning Representations}, \nyear={2018} \n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HyNyyCxA-", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18308344661191182254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HyPpD0g0Z", "title": "Grouping-By-ID: Guarding Against Adversarial Domain Shifts", "track": "main", "status": "Reject", "tldr": "We propose counterfactual regularization to guard against adversarial domain shifts arising through shifts in the distribution of latent \"style features\" of images.", "abstract": "When training a deep neural network for supervised image classification, one can broadly distinguish between two types of latent features of images that will drive the classification of class Y. Following the notation of Gong et al. (2016), we can divide features broadly into the classes of (i) \u201ccore\u201d or \u201cconditionally invariant\u201d features X^ci whose distribution P(X^ci | Y) does not change substantially across domains and (ii) \u201cstyle\u201d or \u201corthogonal\u201d features X^orth whose distribution P(X^orth | Y) can change substantially across domains. These latter orthogonal features would generally include features such as position, rotation, image quality or brightness but also more complex ones like hair color or posture for images of persons. We try to guard against future adversarial domain shifts by ideally just using the \u201cconditionally invariant\u201d features for classification. In contrast to previous work, we assume that the domain itself is not observed and hence a latent variable. We can hence not directly see the distributional change of features across different domains. \n\nWe do assume, however, that we can sometimes observe a so-called identifier or ID variable. We might know, for example, that two images show the same person, with ID referring to the identity of the person. In data augmentation, we generate several images from the same original image, with ID referring to the relevant original image. The method requires only a small fraction of images to have an ID variable.\n\nWe provide a causal framework for the problem by adding the ID variable to the model of Gong et al. (2016). However, we are interested in settings where we cannot observe the domain directly and we treat domain as a latent variable. If two or more samples share the same class and identifier, (Y, ID)=(y,i), then we treat those samples as counterfactuals under different style interventions on the orthogonal or style features. 
Using this grouping-by-ID approach, we regularize the network to provide near constant output across samples that share the same ID by penalizing with an appropriate graph Laplacian. This is shown to substantially improve performance in settings where domains change in terms of image quality, brightness, color changes, and more complex changes such as changes in movement and posture. We show links to questions of interpretability, fairness and transfer learning.", "keywords": "supervised representation learning;causality;interpretability;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Christina Heinze-Deml;Nicolai Meinshausen", "authorids": "heinzedeml@stat.math.ethz.ch;meinshausen@stat.math.ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nheinze-deml2018groupingbyid,\ntitle={Grouping-By-{ID}: Guarding Against Adversarial Domain Shifts},\nauthor={Christina Heinze-Deml and Nicolai Meinshausen},\nyear={2018},\nurl={https://openreview.net/forum?id=HyPpD0g0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyPpD0g0Z", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6056235249597838643&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Cascade Adversarial Machine Learning Regularized with a Unified Embedding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/324", "id": "HyRVBzap-", "author_site": "Taesik Na, Jong Hwan Ko, Saibal Mukhopadhyay", "tldr": "Cascade adversarial training + low level similarity learning improve robustness against both white box and black box attacks.", "abstract": "Injecting adversarial examples during training, known as adversarial training, can improve robustness against one-step attacks, but not for unknown iterative attacks. To address this challenge, we first show iteratively generated adversarial images easily transfer between networks trained with the same strategy. Inspired by this observation, we propose cascade adversarial training, which transfers the knowledge of the end results of adversarial training. We train a network from scratch by injecting iteratively generated adversarial images crafted from already defended networks in addition to one-step adversarial images from the network being trained. We also propose to utilize embedding space for both classification and low-level (pixel-level) similarity learning to ignore unknown pixel level perturbation. During training, we inject adversarial images without replacing their corresponding clean images and penalize the distance between the two embeddings (clean and adversarial). Experimental results show that cascade adversarial training together with our proposed low-level similarity learning efficiently enhances the robustness against iterative attacks, but at the expense of decreased robustness against one-step attacks. 
We show that combining those two techniques can also improve robustness under the worst case black box attack scenario.", "keywords": "adversarial machine learning;embedding;regularization;adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Taesik Na;Jong Hwan Ko;Saibal Mukhopadhyay", "authorids": "taesik.na@gatech.edu;jonghwan.ko@gatech.edu;saibal.mukhopadhyay@ece.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nna2018cascade,\ntitle={Cascade Adversarial Machine Learning Regularized with a Unified Embedding},\nauthor={Taesik Na and Jong Hwan Ko and Saibal Mukhopadhyay},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyRVBzap-},\n}", "github": "[![github](/images/github_icon.svg) taesikna/cascade_adv_training](https://github.com/taesikna/cascade_adv_training)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11749941240097246023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyRVBzap-", "pdf": "https://openreview.net/pdf?id=HyRVBzap-", "email": ";;", "author_num": 3 }, { "title": "Multi-Mention Learning for Reading Comprehension with Neural Cascades", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/95", "id": "HyRnez-RW", "author_site": "Swabha Swayamdipta, Ankur Parikh, Tom Kwiatkowski", "tldr": "We propose neural cascades, a simple and trivially parallelizable approach to reading comprehension, consisting only of feed-forward nets and attention that achieves state-of-the-art performance on the TriviaQA dataset.", "abstract": "Reading comprehension is a challenging task, especially when executed across longer or across multiple evidence documents, where the answer is likely to reoccur. Existing neural architectures typically do not scale to the entire evidence, and hence, resort to selecting a single passage in the document (either via truncation or other means), and carefully searching for the answer within that passage. However, in some cases, this strategy can be suboptimal, since by focusing on a specific passage, it becomes difficult to leverage multiple mentions of the same answer throughout the document. In this work, we take a different approach by constructing lightweight models that are combined in a cascade to find the answer. Each submodel consists only of feed-forward networks equipped with an attention mechanism, making it trivially parallelizable. We show that our approach can scale to approximately an order of magnitude larger evidence documents and can aggregate information from multiple mentions of each answer candidate across the document. Empirically, our approach achieves state-of-the-art performance on both the Wikipedia and web domains of the TriviaQA dataset, outperforming more complex, recurrent architectures.", "keywords": "reading comprehension;multi-loss;question answering;scalable;TriviaQA;feed-forward;latent variable;attention", "primary_area": "", "supplementary_material": "", "author": "Swabha Swayamdipta;Ankur P. 
Parikh;Tom Kwiatkowski", "authorids": "swabha@cs.cmu.edu;aparikh@google.com;tomkwiat@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nswayamdipta2018multimention,\ntitle={Multi-Mention Learning for Reading Comprehension with Neural Cascades},\nauthor={Swabha Swayamdipta and Ankur P. Parikh and Tom Kwiatkowski},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyRnez-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8351416236501756468&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyRnez-RW", "pdf": "https://openreview.net/pdf?id=HyRnez-RW", "email": ";;", "author_num": 3 }, { "id": "HyTrSegCb", "title": "Achieving morphological agreement with Concorde", "track": "main", "status": "Reject", "tldr": "Proposed architecture to solve morphological agreement task", "abstract": "Neural conversational models are widely used in applications like personal assistants and chat bots. These models seem to give better performance when operating on word level. However, for fusion languages like French, Russian and Polish vocabulary size sometimes become infeasible since most of the words have lots of word forms. We propose a neural network architecture for transforming normalized text into a grammatically correct one. Our model efficiently employs correspondence between normalized and target words and significantly outperforms character-level models while being 2x faster in training and 20\\% faster at evaluation. We also propose a new pipeline for building conversational models: first generate a normalized answer and then transform it into a grammatically correct one using our network. 
The proposed pipeline gives better performance than character-level conversational models according to assessor testing.", "keywords": "NLP;morphology;seq2seq", "primary_area": "", "supplementary_material": "", "author": "Daniil Polykovskiy;Dmitry Soloviev", "authorids": "daniil.polykovskiy@gmail.com;d.soloviev@corp.mail.ru", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npolykovskiy2018achieving,\ntitle={Achieving morphological agreement with Concorde},\nauthor={Daniil Polykovskiy and Dmitry Soloviev},\nyear={2018},\nurl={https://openreview.net/forum?id=HyTrSegCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyTrSegCb", "pdf_size": 0, "rating": "2;5;6", "confidence": "5;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.2773500981126146, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_enFmTqgs7kJ:scholar.google.com/&scioq=Achieving+morphological+agreement+with+Concorde&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Parallelizing Linear Recurrent Neural Nets Over Sequence Length", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/249", "id": "HyUNwulC-", "author_site": "Eric Martin, Christopher Cundy", "tldr": "use parallel scan to parallelize linear recurrent neural nets. train model on length 1 million dependency", "abstract": "Recurrent neural networks (RNNs) are widely used to model sequential data but\ntheir non-linear dependencies between sequence elements prevent parallelizing\ntraining over sequence length. We show the training of RNNs with only linear\nsequential dependencies can be parallelized over the sequence length using the\nparallel scan algorithm, leading to rapid training on long sequences even with\nsmall minibatch size. We develop a parallel linear recurrence CUDA kernel and\nshow that it can be applied to immediately speed up training and inference of\nseveral state of the art RNN architectures by up to 9x. We abstract recent work\non linear RNNs into a new framework of linear surrogate RNNs and develop a\nlinear surrogate model for the long short-term memory unit, the GILR-LSTM, that\nutilizes parallel linear recurrence. 
We extend sequence learning to new\nextremely long sequence regimes that were previously out of reach by\nsuccessfully training a GILR-LSTM on a synthetic sequence classification task\nwith a one million timestep dependency.\n", "keywords": "rnn;sequence;parallel;qrnn;sru;gilr;gilr-lstm", "primary_area": "", "supplementary_material": "", "author": "Eric Martin;Chris Cundy", "authorids": "eric@ericmart.in;chris.j.cundy@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmartin2018parallelizing,\ntitle={Parallelizing Linear Recurrent Neural Nets Over Sequence Length},\nauthor={Eric Martin and Chris Cundy},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyUNwulC-},\n}", "github": "[![github](/images/github_icon.svg) eamartin/parallelizing_linear_rnns](https://github.com/eamartin/parallelizing_linear_rnns)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;2", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9088138984771485785&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyUNwulC-", "pdf": "https://openreview.net/pdf?id=HyUNwulC-", "email": ";", "author_num": 2 }, { "id": "HyW0afxKM", "title": "Learning a set of interrelated tasks by using a succession of motor policies for a socially guided intrinsically motivated learner", "track": "main", "status": "Withdraw", "tldr": "The paper describes a strategic intrinsically motivated learning algorithm which tackles the learning of complex motor policies.", "abstract": "We propose an active learning algorithmic architecture, capable of organizing its learning process in order to achieve a field of complex tasks by learning sequences of primitive motor policies : Socially Guided Intrinsic Motivation with Procedure Babbling (SGIM-PB). The learner can generalize over its experience to continuously learn new outcomes, by choosing actively what and how to learn guided by empirical measures of its own progress. In this paper, we are considering the learning of a set of interrelated complex outcomes hierarchically organized.\n\nWe introduce a new framework called \"procedures\", which enables the autonomous discovery of how to combine previously learned skills in order to learn increasingly more complex motor policies (combinations of primitive motor policies). Our architecture can actively decide which outcome to focus on and which exploration strategy to apply. Those strategies could be autonomous exploration, or active social guidance, where it relies on the expertise of a human teacher providing demonstrations at the learner's request. We show on a simulated environment that our new architecture is capable of tackling the learning of complex motor policies, to adapt the complexity of its policies to the task at hand. 
We also show that our \"procedures\" increases the agent's capability to learn complex tasks.", "keywords": "developmental robotics;intrinsic motivation;strategic learning;complex motor policies", "primary_area": "", "supplementary_material": "", "author": "Nicolas Duminy;Sao Mai Nguyen;Dominique Duhaut", "authorids": "nicolas.duminy@telecom-bretagne.eu;nguyensmai@gmail.com;dominique.duhaut@univ-ubs.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HyW0afxKM", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 3, "corr_rating_confidence": 0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7517977772321283133&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "title": "Stochastic gradient descent performs variational inference, converges to limit cycles for deep networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/152", "id": "HyWrIgW0W", "author_site": "Pratik A Chaudhari, Stefano Soatto", "tldr": "SGD implicitly performs variational inference; gradient noise is highly non-isotropic, so SGD does not even converge to critical points of the original loss", "abstract": "Stochastic gradient descent (SGD) is widely believed to perform implicit regularization when used to train deep neural networks, but the precise manner in which this occurs has thus far been elusive. We prove that SGD minimizes an average potential over the posterior distribution of weights along with an entropic regularization term. This potential is however not the original loss function in general. So SGD does perform variational inference, but for a different loss than the one used to compute the gradients. Even more surprisingly, SGD does not even converge in the classical sense: we show that the most likely trajectories of SGD for deep networks do not behave like Brownian motion around critical points. Instead, they resemble closed loops with deterministic components. We prove that such out-of-equilibrium behavior is a consequence of highly non-isotropic gradient noise in SGD; the covariance matrix of mini-batch gradients for deep networks has a rank as small as 1% of its dimension. 
We provide extensive empirical validation of these claims, proven in the appendix.", "keywords": "sgd;variational inference;gradient noise;out-of-equilibrium", "primary_area": "", "supplementary_material": "", "author": "Pratik Chaudhari;Stefano Soatto", "authorids": "pratikac@ucla.edu;soatto@ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nchaudhari2018stochastic,\ntitle={Stochastic gradient descent performs variational inference, converges to limit cycles for deep networks},\nauthor={Pratik Chaudhari and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyWrIgW0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.944911182523068, "gs_citation": 375, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2003607680024773547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HyWrIgW0W", "pdf": "https://openreview.net/pdf?id=HyWrIgW0W", "email": ";", "author_num": 2 }, { "id": "HyXBcYg0b", "title": "Residual Gated Graph ConvNets", "track": "main", "status": "Reject", "tldr": "We compare graph RNNs and graph ConvNets, and we consider the most generic class of graph ConvNets with residuality.", "abstract": "Graph-structured data such as social networks, functional brain networks, gene regulatory networks, communications networks have brought the interest in generalizing deep learning techniques to graph domains. In this paper, we are interested to design neural networks for graphs with variable length in order to solve learning problems such as vertex classification, graph classification, graph regression, and graph generative tasks. Most existing works have focused on recurrent neural networks (RNNs) to learn meaningful representations of graphs, and more recently new convolutional neural networks (ConvNets) have been introduced. In this work, we want to compare rigorously these two fundamental families of architectures to solve graph learning tasks. We review existing graph RNN and ConvNet architectures, and propose natural extension of LSTM and ConvNet to graphs with arbitrary size. Then, we design a set of analytically controlled experiments on two basic graph problems, i.e. subgraph matching and graph clustering, to test the different architectures. Numerical results show that the proposed graph ConvNets are 3-17% more accurate and 1.5-4x faster than graph RNNs. Graph ConvNets are also 36% more accurate than variational (non-learning) techniques. Finally, the most effective graph ConvNet architecture uses gated edges and residuality. 
Residuality plays an essential role to learn multi-layer architectures as they provide a 10% gain of performance.", "keywords": "graph neural networks;ConvNets;RNNs;pattern matching;semi-supervised clustering", "primary_area": "", "supplementary_material": "", "author": "Xavier Bresson;Thomas Laurent", "authorids": "xbresson@ntu.edu.sg;tlaurent@lmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbresson2018residual,\ntitle={Residual Gated Graph ConvNets},\nauthor={Xavier Bresson and Thomas Laurent},\nyear={2018},\nurl={https://openreview.net/forum?id=HyXBcYg0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyXBcYg0b", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -0.2773500981126145, "gs_citation": 589, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14809063263659708852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HyXNCZbCZ", "title": "Hierarchical Adversarially Learned Inference", "track": "main", "status": "Reject", "tldr": "Adversarially trained hierarchical generative model with robust and semantically learned latent representation.", "abstract": "We propose a novel hierarchical generative model with a simple Markovian structure and a corresponding inference model. Both the generative and inference model are trained using the adversarial learning paradigm. We demonstrate that the hierarchical structure supports the learning of progressively more abstract representations as well as providing semantically meaningful reconstructions with different levels of fidelity. Furthermore, we show that minimizing the Jensen-Shanon divergence between the generative and inference network is enough to minimize the reconstruction error. The resulting semantically meaningful hierarchical latent structure discovery is exemplified on the CelebA dataset. There, we show that the features learned by our model in an unsupervised way outperform the best handcrafted features. Furthermore, the extracted features remain competitive when compared to several recent deep supervised approaches on an attribute prediction task on CelebA. Finally, we leverage the model's inference network to achieve state-of-the-art performance on a semi-supervised variant of the MNIST digit classification task. 
", "keywords": "generative;hierarchical;unsupervised;semisupervised;latent;ALI;GAN", "primary_area": "", "supplementary_material": "", "author": "Mohamed Ishmael Belghazi;Sai Rajeswar;Olivier Mastropietro;Negar Rostamzadeh;Jovana Mitrovic;Aaron Courville", "authorids": "ishmael.belghazi@gmail.com;rajsai24@gmail.com;oli.mastro@gmail.com;negar.rostamzadeh@gmail.com;jovana.mitrovic@spc.ox.ac.uk;aaron.courville@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nishmael2018hierarchical,\ntitle={Hierarchical Adversarially Learned Inference},\nauthor={Mohamed Ishmael Belghazi and Sai Rajeswar and Olivier Mastropietro and Negar Rostamzadeh and Jovana Mitrovic and Aaron Courville},\nyear={2018},\nurl={https://openreview.net/forum?id=HyXNCZbCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyXNCZbCZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "5;5;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2264425702653590569&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HyY0Ff-AZ", "title": "Representing Entropy : A short proof of the equivalence between soft Q-learning and policy gradients", "track": "main", "status": "Reject", "tldr": "A short proof of the equivalence of soft Q-learning and policy gradients.", "abstract": "Two main families of reinforcement learning algorithms, Q-learning and policy gradients, have recently been proven to be equivalent when using a softmax relaxation on one part, and an entropic regularization on the other. We relate this result to the well-known convex duality of Shannon entropy and the softmax function. Such a result is also known as the Donsker-Varadhan formula. This provides a short proof of the equivalence. We then interpret this duality further, and use ideas of convex analysis to prove a new policy inequality relative to soft Q-learning.", "keywords": "soft Q-learning;policy gradients;entropy;Legendre transformation;duality;convex analysis;Donsker-Varadhan", "primary_area": "", "supplementary_material": "", "author": "Pierre H. Richemond;Brendan Maginnis", "authorids": "phr17@imperial.ac.uk;b.maginnis@imperial.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018representing,\ntitle={Representing Entropy : A short proof of the equivalence between soft Q-learning and policy gradients},\nauthor={Pierre H. 
Richemond and Brendan Maginnis},\nyear={2018},\nurl={https://openreview.net/forum?id=HyY0Ff-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyY0Ff-AZ", "pdf_size": 0, "rating": "2;5;5", "confidence": "5;5;4", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UhquaBYbJaEJ:scholar.google.com/&scioq=Representing+Entropy+:+A+short+proof+of+the+equivalence+between+soft+Q-learning+and+policy+gradients&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Debiasing Evidence Approximations: On Importance-weighted Autoencoders and Jackknife Variational Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/114", "id": "HyZoi-WRb", "author_site": "Sebastian Nowozin", "tldr": "Variational inference is biased, let's debias it.", "abstract": "The importance-weighted autoencoder (IWAE) approach of Burda et al. defines a sequence of increasingly tighter bounds on the marginal likelihood of latent variable models. Recently, Cremer et al. reinterpreted the IWAE bounds as ordinary variational evidence lower bounds (ELBO) applied to increasingly accurate variational distributions. In this work, we provide yet another perspective on the IWAE bounds. We interpret each IWAE bound as a biased estimator of the true marginal likelihood where for the bound defined on $K$ samples we show the bias to be of order O(1/K). In our theoretical analysis of the IWAE objective we derive asymptotic bias and variance expressions. Based on this analysis we develop jackknife variational inference (JVI),\na family of bias-reduced estimators reducing the bias to $O(K^{-(m+1)})$ for any given m < K while retaining computational efficiency. Finally, we demonstrate that JVI leads to improved evidence estimates in variational autoencoders. 
We also report first results on applying JVI to learning variational autoencoders.\n\nOur implementation is available at https://github.com/Microsoft/jackknife-variational-inference", "keywords": "variational inference;approximate inference;generative models", "primary_area": "", "supplementary_material": "", "author": "Sebastian Nowozin", "authorids": "sebastian.nowozin@microsoft.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nnowozin2018debiasing,\ntitle={Debiasing Evidence Approximations: On Importance-weighted Autoencoders and Jackknife Variational Inference},\nauthor={Sebastian Nowozin},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyZoi-WRb},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/jackknife-variational-inference](https://github.com/Microsoft/jackknife-variational-inference)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 1, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9069832931054868249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HyZoi-WRb", "pdf": "https://openreview.net/pdf?id=HyZoi-WRb", "email": "", "author_num": 1 }, { "id": "Hy_o3x-0b", "title": "Feature Map Variational Auto-Encoders", "track": "main", "status": "Reject", "tldr": "We present a generative model that proves state-of-the-art results on gray-scale and natural images.", "abstract": "There have been multiple attempts with variational auto-encoders (VAE) to learn powerful global representations of complex data using a combination of latent stochastic variables and an autoregressive model over the dimensions of the data. However, for the most challenging natural image tasks the purely autoregressive model with stochastic variables still outperform the combined stochastic autoregressive models. In this paper, we present simple additions to the VAE framework that generalize to natural images by embedding spatial information in the stochastic layers. We significantly improve the state-of-the-art results on MNIST, OMNIGLOT, CIFAR10 and ImageNet when the feature map parameterization of the stochastic variables are combined with the autoregressive PixelCNN approach. Interestingly, we also observe close to state-of-the-art results without the autoregressive part. 
This opens the possibility for high quality image generation with only one forward-pass.\n", "keywords": "deep learning;representation learning;variational auto-encoders;variational inference;generative models", "primary_area": "", "supplementary_material": "", "author": "Lars Maal\u00f8e;Ole Winther", "authorids": "larsma@dtu.dk;olwi@dtu.dk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmaal\u00f8e2018feature,\ntitle={Feature Map Variational Auto-Encoders},\nauthor={Lars Maal\u00f8e and Ole Winther},\nyear={2018},\nurl={https://openreview.net/forum?id=Hy_o3x-0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hy_o3x-0b", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 23, "authors#_avg": 2, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1253543378002119254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HydnA1WCb", "title": "Gaussian Prototypical Networks for Few-Shot Learning on Omniglot", "track": "main", "status": "Reject", "tldr": "A novel architecture for few-shot classification capable of dealing with uncertainty.", "abstract": "We propose a novel architecture for k-shot classification on the Omniglot dataset. Building on prototypical networks, we extend their architecture to what we call Gaussian prototypical networks. Prototypical networks learn a map between images and embedding vectors, and use their clustering for classification. In our model, a part of the encoder output is interpreted as a confidence region estimate about the embedding point, and expressed as a Gaussian covariance matrix. Our network then constructs a direction and class dependent distance metric on the embedding space, using uncertainties of individual data points as weights. We show that Gaussian prototypical networks are a preferred architecture over vanilla prototypical networks with an equivalent number of parameters. We report results consistent with state-of-the-art performance in 1-shot and 5-shot classification both in 5-way and 20-way regime on the Omniglot dataset. We explore artificially down-sampling a fraction of images in the training set, which improves our performance. 
Our experiments therefore lead us to hypothesize that Gaussian prototypical networks might perform better in less homogeneous, noisier datasets, which are commonplace in real world applications.", "keywords": "one-shot learning;few-shot learning;Omniglot", "primary_area": "", "supplementary_material": "", "author": "Stanislav Fort", "authorids": "sfort1@stanford.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nfort2018gaussian,\ntitle={Gaussian Prototypical Networks for Few-Shot Learning on Omniglot},\nauthor={Stanislav Fort},\nyear={2018},\nurl={https://openreview.net/forum?id=HydnA1WCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HydnA1WCb", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3062380745530370175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Memory Augmented Control Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/321", "id": "HyfHgI6aW", "author_site": "Arbaaz Khan, Clark Zhang, Nikolay Atanasov, Konstantinos Karydis, Vijay Kumar, Daniel D Lee", "tldr": "Memory Augmented Network to plan in partially observable environments. ", "abstract": "Planning problems in partially observable environments cannot be solved directly with convolutional networks and require some form of memory. But, even memory networks with sophisticated addressing schemes are unable to learn intelligent reasoning satisfactorily due to the complexity of simultaneously learning to access memory and plan. To mitigate these challenges we propose the Memory Augmented Control Network (MACN). The network splits planning into a hierarchical process. At a lower level, it learns to plan in a locally observed space. At a higher level, it uses a collection of policies computed on locally observed spaces to learn an optimal plan in the global environment it is operating in. The performance of the network is evaluated on path planning tasks in environments in the presence of simple and complex obstacles and in addition, is tested for its ability to generalize to new environments not seen in the training set.", "keywords": "planning;memory networks;deep learning;robotics", "primary_area": "", "supplementary_material": "", "author": "Arbaaz Khan;Clark Zhang;Nikolay Atanasov;Konstantinos Karydis;Vijay Kumar;Daniel D. Lee", "authorids": "arbaazk@seas.upenn.edu;clarkz@seas.upenn.edu;natanasov@ucsd.edu;konstantinos.karydis@ucr.edu;vijay.kumar@seas.upenn.edu;ddlee@seas.upenn.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nkhan2018memory,\ntitle={Memory Augmented Control Networks},\nauthor={Arbaaz Khan and Clark Zhang and Nikolay Atanasov and Konstantinos Karydis and Vijay Kumar and Daniel D. 
Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyfHgI6aW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;6;9", "confidence": "5;2;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.21677749238103003, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13389575768683358971&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=HyfHgI6aW", "pdf": "https://openreview.net/pdf?id=HyfHgI6aW", "email": ";;;;;", "author_num": 6 }, { "title": "Generating Wikipedia by Summarizing Long Sequences", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/121", "id": "Hyg0vbWC-", "author_site": "Peter J Liu, Mohammad Saleh, Etienne Pot, Ben Goodrich, Ryan Sepassi, Lukasz Kaiser, Noam Shazeer", "tldr": "We generate Wikipedia articles abstractively conditioned on source document text.", "abstract": "We show that generating English Wikipedia articles can be approached as a multi-\ndocument summarization of source documents. We use extractive summarization\nto coarsely identify salient information and a neural abstractive model to generate\nthe article. For the abstractive model, we introduce a decoder-only architecture\nthat can scalably attend to very long sequences, much longer than typical encoder-\ndecoder architectures used in sequence transduction. We show that this model can\ngenerate fluent, coherent multi-sentence paragraphs and even whole Wikipedia\narticles. When given reference documents, we show it can extract relevant factual\ninformation as reflected in perplexity, ROUGE scores and human evaluations.", "keywords": "abstractive summarization;Transformer;long sequences;natural language processing;sequence transduction;Wikipedia;extractive summarization", "primary_area": "", "supplementary_material": "", "author": "Peter J. Liu*;Mohammad Saleh*;Etienne Pot;Ben Goodrich;Ryan Sepassi;Lukasz Kaiser;Noam Shazeer", "authorids": "peterjliu@google.com;msaleh@google.com;epot@google.com;bgoodrich@google.com;rsepassi@google.com;lukaszkaiser@google.com;noam@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nj.2018generating,\ntitle={Generating Wikipedia by Summarizing Long Sequences},\nauthor={Peter J. 
Liu* and Mohammad Saleh* and Etienne Pot and Ben Goodrich and Ryan Sepassi and Lukasz Kaiser and Noam Shazeer},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Hyg0vbWC-},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Hyg0vbWC-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;5;3", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1080, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9480555348664414627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Hyg0vbWC-", "pdf": "https://openreview.net/pdf?id=Hyg0vbWC-", "email": ";;;;;;", "author_num": 7 }, { "title": "TD or not TD: Analyzing the Role of Temporal Differencing in Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/190", "id": "HyiAuyb0b", "author_site": "Artemij Amiranashvili, Alexey Dosovitskiy, Vladlen Koltun, Thomas Brox", "tldr": "", "abstract": "Our understanding of reinforcement learning (RL) has been shaped by theoretical and empirical results that were obtained decades ago using tabular representations and linear function approximators. These results suggest that RL methods that use temporal differencing (TD) are superior to direct Monte Carlo estimation (MC). How do these results hold up in deep RL, which deals with perceptually complex environments and deep nonlinear models? In this paper, we re-examine the role of TD in modern deep RL, using specially designed environments that control for specific factors that affect performance, such as reward sparsity, reward delay, and the perceptual complexity of the task. When comparing TD with infinite-horizon MC, we are able to reproduce classic results in modern settings. Yet we also find that finite-horizon MC is not inferior to TD, even when rewards are sparse or delayed. 
This makes MC a viable alternative to TD in deep RL.", "keywords": "deep learning;reinforcement learning;temporal difference", "primary_area": "", "supplementary_material": "", "author": "Artemij Amiranashvili;Alexey Dosovitskiy;Vladlen Koltun;Thomas Brox", "authorids": "amiranas@cs.uni-freiburg.de;adosovitskiy@gmail.com;vkoltun@gmail.com;brox@cs.uni-freiburg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\namiranashvili2018analyzing,\ntitle={Analyzing the Role of Temporal Differencing in Deep Reinforcement Learning},\nauthor={Artemij Amiranashvili and Alexey Dosovitskiy and Vladlen Koltun and Thomas Brox},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyiAuyb0b},\n}", "github": "[![github](/images/github_icon.svg) lmb-freiburg/td-or-not-td](https://github.com/lmb-freiburg/td-or-not-td)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17309732018163861252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=HyiAuyb0b", "pdf": "https://openreview.net/pdf?id=HyiAuyb0b", "email": ";;;", "author_num": 4 }, { "id": "HyiRazbRb", "title": "Demystifying overcomplete nonlinear auto-encoders: fast SGD convergence towards sparse representation from random initialization", "track": "main", "status": "Reject", "tldr": "theoretical analysis of nonlinear wide autoencoder", "abstract": "Auto-encoders are commonly used for unsupervised representation learning and for pre-training deeper neural networks.\nWhen its activation function is linear and the encoding dimension (width of hidden layer) is smaller than the input dimension, it is well known that auto-encoder is optimized to learn the principal components of the data distribution (Oja1982).\nHowever, when the activation is nonlinear and when the width is larger than the input dimension (overcomplete), auto-encoder behaves differently from PCA, and in fact is known to perform well empirically for sparse coding problems. \n\nWe provide a theoretical explanation for this empirically observed phenomenon, when rectified-linear unit (ReLu) is adopted as the activation function and the hidden-layer width is set to be large.\nIn this case, we show that, with significant probability, initializing the weight matrix of an auto-encoder by sampling from a spherical Gaussian distribution followed by stochastic gradient descent (SGD) training converges towards the ground-truth representation for a class of sparse dictionary learning models.\nIn addition, we can show that, conditioning on convergence, the expected convergence rate is O(1/t), where t is the number of updates.\nOur analysis quantifies how increasing hidden layer width helps the training performance when random initialization is used, and how the norm of network weights influence the speed of SGD convergence. 
", "keywords": "stochastic gradient descent;autoencoders;nonconvex optimization;representation learning;theory", "primary_area": "", "supplementary_material": "", "author": "Cheng Tang;Claire Monteleoni", "authorids": "tangch@gwu.edu;cmontel@gwu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntang2018demystifying,\ntitle={Demystifying overcomplete nonlinear auto-encoders: fast {SGD} convergence towards sparse representation from random initialization},\nauthor={Cheng Tang and Claire Monteleoni},\nyear={2018},\nurl={https://openreview.net/forum?id=HyiRazbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyiRazbRb", "pdf_size": 0, "rating": "2;2;3", "confidence": "4;4;3", "rating_avg": 2.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11504490872530552349&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Hyig0zb0Z", "title": "Gated ConvNets for Letter-Based ASR", "track": "main", "status": "Reject", "tldr": "A letter-based ConvNet acoustic model leads to a simple and competitive speech recognition pipeline.", "abstract": "In this paper we introduce a new speech recognition system, leveraging a simple letter-based ConvNet acoustic model. The acoustic model requires only audio transcription for training -- no alignment annotations, nor any forced alignment step is needed. At inference, our decoder takes only a word list and a language model, and is fed with letter scores from the acoustic model -- no phonetic word lexicon is needed. Key ingredients for the acoustic model are Gated Linear Units and high dropout. 
We show near state-of-the-art results in word error rate on the LibriSpeech corpus with MFSC features, both on the clean and other configurations.\n", "keywords": "automatic speech recognition;letter-based acoustic model;gated convnets", "primary_area": "", "supplementary_material": "", "author": "Vitaliy Liptchinsky;Gabriel Synnaeve;Ronan Collobert", "authorids": "vitaliy888@fb.com;gab@fb.com;locronan@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliptchinsky2018gated,\ntitle={Gated ConvNets for Letter-Based {ASR}},\nauthor={Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert},\nyear={2018},\nurl={https://openreview.net/forum?id=Hyig0zb0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyig0zb0Z", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UEAV_jQzuB4J:scholar.google.com/&scioq=Gated+ConvNets+for+Letter-Based+ASR&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "title": "Meta-Learning and Universality: Deep Representations and Gradient Descent can Approximate any Learning Algorithm", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/185", "id": "HyjC5yWCW", "author_site": "Chelsea Finn, Sergey Levine", "tldr": "Deep representations combined with gradient descent can approximate any learning algorithm.", "abstract": "Learning to learn is a powerful paradigm for enabling models to learn from data more effectively and efficiently. A popular approach to meta-learning is to train a recurrent model to read in a training dataset as input and output the parameters of a learned model, or output predictions for new test inputs. Alternatively, a more recent approach to meta-learning aims to acquire deep representations that can be effectively fine-tuned, via standard gradient descent, to new tasks. In this paper, we consider the meta-learning problem from the perspective of universality, formalizing the notion of learning algorithm approximation and comparing the expressive power of the aforementioned recurrent models to the more recent approaches that embed gradient descent into the meta-learner. In particular, we seek to answer the following question: does deep representation combined with standard gradient descent have sufficient capacity to approximate any learning algorithm? 
We find that this is indeed true, and further find, in our experiments, that gradient-based meta-learning consistently leads to learning strategies that generalize more widely compared to those represented by recurrent models.", "keywords": "meta-learning;learning to learn;universal function approximation", "primary_area": "", "supplementary_material": "", "author": "Chelsea Finn;Sergey Levine", "authorids": "cbfinn@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nfinn2018metalearning,\ntitle={Meta-Learning and Universality: Deep Representations and Gradient Descent can Approximate any Learning Algorithm},\nauthor={Chelsea Finn and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyjC5yWCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "1;3;1", "rating_avg": 6.333333333333333, "confidence_avg": 1.6666666666666667, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 324, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2185954399232722694&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HyjC5yWCW", "pdf": "https://openreview.net/pdf?id=HyjC5yWCW", "email": ";", "author_num": 2 }, { "id": "HylgYB3pZ", "title": "Linearly Constrained Weights: Resolving the Vanishing Gradient Problem by Reducing Angle Bias", "track": "main", "status": "Reject", "tldr": "We identify angle bias that causes the vanishing gradient problem in deep nets and propose an efficient method to reduce the bias.", "abstract": "In this paper, we first identify \\textit{angle bias}, a simple but remarkable phenomenon that causes the vanishing gradient problem in a multilayer perceptron (MLP) with sigmoid activation functions. We then propose \\textit{linearly constrained weights (LCW)} to reduce the angle bias in a neural network, so as to train the network under the constraints that the sum of the elements of each weight vector is zero. A reparameterization technique is presented to efficiently train a model with LCW by embedding the constraints on weight vectors into the structure of the network. Interestingly, batch normalization (Ioffe & Szegedy, 2015) can be viewed as a mechanism to correct angle bias. 
Preliminary experiments show that LCW helps train a 100-layered MLP more efficiently than does batch normalization.", "keywords": "vanishing gradient problem;multilayer perceptron;angle bias", "primary_area": "", "supplementary_material": "", "author": "Takuro Kutsuna", "authorids": "kutsuna@mosk.tytlabs.co.jp", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nkutsuna2018linearly,\ntitle={Linearly Constrained Weights: Resolving the Vanishing Gradient Problem by Reducing Angle Bias},\nauthor={Takuro Kutsuna},\nyear={2018},\nurl={https://openreview.net/forum?id=HylgYB3pZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HylgYB3pZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12051624509869820312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HymYLebCb", "title": "Network Signatures from Image Representation of Adjacency Matrices: Deep/Transfer Learning for Subgraph Classification", "track": "main", "status": "Reject", "tldr": "We convert subgraphs into structured images and classify them using 1. deep learning and 2. transfer learning (Caffe) and achieve stunning results.", "abstract": "We propose a novel subgraph image representation for classification of network fragments with the target being their parent networks. The graph image representation is based on 2D image embeddings of adjacency matrices. We use this image representation in two modes. First, as the input to a machine learning algorithm. Second, as the input to a pure transfer learner. Our conclusions from multiple datasets are that\n1. deep learning using structured image features performs the best compared to graph kernel and classical features based methods; and,\n2. 
pure transfer learning works effectively with minimum interference from the user and is robust against small data.\n", "keywords": "deep learning;transfer learning;adjacency matrices;image feature representation;Caffe;graph classification", "primary_area": "", "supplementary_material": "", "author": "Kshiteesh Hegde;Malik Magdon-Ismail;Ram Ramanathan;Bishal Thapa", "authorids": "hegdek2@rpi.edu;magdon@rpi.edu;ram@gotenna.com;bishal.thapa@raytheon.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhegde2018network,\ntitle={Network Signatures from Image Representation of Adjacency Matrices: Deep/Transfer Learning for Subgraph Classification},\nauthor={Kshiteesh Hegde and Malik Magdon-Ismail and Ram Ramanathan and Bishal Thapa},\nyear={2018},\nurl={https://openreview.net/forum?id=HymYLebCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HymYLebCb", "pdf_size": 0, "rating": "3;6;6", "confidence": "3;3;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15684498377502531391&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "Hymt27b0Z", "title": "MINE: Mutual Information Neural Estimation", "track": "main", "status": "Reject", "tldr": "A scalable in sample size and dimensions mutual information estimator.", "abstract": "This paper presents a Mutual Information Neural Estimator (MINE) that is linearly scalable in dimensionality as well as in sample size. MINE is back-propable and we prove that it is strongly consistent. We illustrate a handful of applications in which MINE is succesfully applied to enhance the property of generative models in both unsupervised and supervised settings. We apply our framework to estimate the information bottleneck, and apply it in tasks related to supervised classification problems. 
Our results demonstrate substantial added flexibility and improvement in these settings.\n", "keywords": "Deep Learning;Neural Networks;Information Theory;Generative models;GAN;Adversarial", "primary_area": "", "supplementary_material": "", "author": "Mohamed Ishmael Belghazi;Sai Rajeswar;Aristide Baratin;Devon Hjelm;Aaron Courville", "authorids": "ishmael.belghazi@gmail.com;rajsai24@gmail.com;aristidebaratin@hotmail.com;eronous@gmail.com;aaron.courville@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nishmael2018mine,\ntitle={{MINE}: Mutual Information Neural Estimation},\nauthor={Mohamed Ishmael Belghazi and Sai Rajeswar and Aristide Baratin and Devon Hjelm and Aaron Courville},\nyear={2018},\nurl={https://openreview.net/forum?id=Hymt27b0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hymt27b0Z", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 753, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4616043598945550206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HymuJz-A-", "title": "Not-So-CLEVR: Visual Relations Strain Feedforward Neural Networks", "track": "main", "status": "Workshop", "tldr": "Using a novel, controlled, visual-relation challenge, we show that same-different tasks critically strain the capacity of CNNs; we argue that visual relations can be better solved using attention-mnemonic strategies.", "abstract": "The robust and efficient recognition of visual relations in images is a hallmark of biological vision. Here, we argue that, despite recent progress in visual recognition, modern machine vision algorithms are severely limited in their ability to learn visual relations. Through controlled experiments, we demonstrate that visual-relation problems strain convolutional neural networks (CNNs). The networks eventually break altogether when rote memorization becomes impossible such as when the intra-class variability exceeds their capacity. We further show that another type of feedforward network, called a relational network (RN), which was shown to successfully solve seemingly difficult visual question answering (VQA) problems on the CLEVR datasets, suffers similar limitations. 
Motivated by the comparable success of biological vision, we argue that feedback mechanisms including working memory and attention are the key computational components underlying abstract visual reasoning.", "keywords": "Visual Relations;Visual Reasoning;SVRT;Attention;Working Memory;Convolutional Neural Network;Deep Learning;Relational Network", "primary_area": "", "supplementary_material": "", "author": "Junkyung Kim;Matthew Ricci;Thomas Serre", "authorids": "junkyung_kim@brown.edu;matthew_ricci_1@brown.edu;thomas_serre@brown.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkim2018notsoclevr,\ntitle={Not-So-{CLEVR}: Visual Relations Strain Feedforward Neural Networks},\nauthor={Junkyung Kim and Matthew Ricci and Thomas Serre},\nyear={2018},\nurl={https://openreview.net/forum?id=HymuJz-A-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HymuJz-A-", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8901830691852799264&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hyp-JJJRW", "title": "Style Memory: Making a Classifier Network Generative", "track": "main", "status": "Reject", "tldr": "Augmenting the top layer of a classifier network with a style memory enables it to be generative.", "abstract": "Deep networks have shown great performance in classification tasks. However, the parameters learned by the classifier networks usually discard stylistic information of the input, in favour of information strictly relevant to classification. We introduce a network that has the capacity to do both classification and reconstruction by adding a \"style memory\" to the output layer of the network. We also show how to train such a neural network as a deep multi-layer autoencoder, jointly minimizing both classification and reconstruction losses. The generative capacity of our network demonstrates that the combination of style-memory neurons with the classifier neurons yield good reconstructions of the inputs when the classification is correct. 
We further investigate the nature of the style memory, and how it relates to composing digits and letters.", "keywords": "neural networks;autoencoder;generative;feed-back", "primary_area": "", "supplementary_material": "", "author": "Rey Wiyatno;Jeff Orchard", "authorids": "rrwiyatn@uwaterloo.ca;jorchard@uwaterloo.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwiyatno2018style,\ntitle={Style Memory: Making a Classifier Network Generative},\nauthor={Rey Wiyatno and Jeff Orchard},\nyear={2018},\nurl={https://openreview.net/forum?id=Hyp-JJJRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyp-JJJRW", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;3", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=745601074668970189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Hyp3i2xRb", "title": "Overcoming the vanishing gradient problem in plain recurrent networks", "track": "main", "status": "Reject", "tldr": "We propose a novel network called the Recurrent Identity Network (RIN) which allows a plain recurrent network to overcome the vanishing gradient problem while training very deep models without the use of gates.", "abstract": "Plain recurrent networks greatly suffer from the vanishing gradient problem while Gated Neural Networks (GNNs) such as Long-short Term Memory (LSTM) and Gated Recurrent Unit (GRU) deliver promising results in many sequence learning tasks through sophisticated network designs. This paper shows how we can address this problem in a plain recurrent network by analyzing the gating mechanisms in GNNs. We propose a novel network called the Recurrent Identity Network (RIN) which allows a plain recurrent network to overcome the vanishing gradient problem while training very deep models without the use of gates. We compare this model with IRNNs and LSTMs on multiple sequence modeling benchmarks. The RINs demonstrate competitive performance and converge faster in all tasks. 
Notably, small RIN models produce 12%\u201367% higher accuracy on the Sequential and Permuted MNIST datasets and reach state-of-the-art performance on the bAbI question answering dataset.", "keywords": "vanishing gradient descent;recurrent neural networks;identity mapping", "primary_area": "", "supplementary_material": "", "author": "Yuhuang Hu;Adrian Huber;Shih-Chii Liu", "authorids": "yuhuang.hu@ini.uzh.ch;huberad@ini.uzh.ch;shih@ini.uzh.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhu2018overcoming,\ntitle={Overcoming the vanishing gradient problem in plain recurrent networks},\nauthor={Yuhuang Hu and Adrian Huber and Shih-Chii Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=Hyp3i2xRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hyp3i2xRb", "pdf_size": 0, "rating": "2;4;7", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.11470786693528084, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15647589269551455866&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HypkN9yRW", "title": "DDRprog: A CLEVR Differentiable Dynamic Reasoning Programmer", "track": "main", "status": "Reject", "tldr": "A generic dynamic architecture that employs a problem specific differentiable forking mechanism to encode hard data structure assumptions. Applied to CLEVR VQA and expression evaluation.", "abstract": "We present a generic dynamic architecture that employs a problem specific differentiable forking mechanism to leverage discrete logical information about the problem data structure. We adapt and apply our model to CLEVR Visual Question Answering, giving rise to the DDRprog architecture; compared to previous approaches, our model achieves higher accuracy in half as many epochs with five times fewer learnable parameters. Our model directly models underlying question logic using a recurrent controller that jointly predicts and executes functional neural modules; it explicitly forks subprocesses to handle logical branching. While FiLM and other competitive models are static architectures with less supervision, we argue that inclusion of program labels enables learning of higher level logical operations -- our architecture achieves particularly high performance on questions requiring counting and integer comparison. We further demonstrate the generality of our approach though DDRstack -- an application of our method to reverse Polish notation expression evaluation in which the inclusion of a stack assumption allows our approach to generalize to long expressions, significantly outperforming an LSTM with ten times as many learnable parameters.", "keywords": "CLEVR;VQA;Visual Question Answering;Neural Programmer", "primary_area": "", "supplementary_material": "", "author": "Joseph Suarez;Justin Johnson;L. 
Fei-Fei", "authorids": "joseph15@stanford.edu;jcjohns@cs.stanford.edu;feifeili@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsuarez2018ddrprog,\ntitle={{DDR}prog: A {CLEVR} Differentiable Dynamic Reasoning Programmer},\nauthor={Joseph Suarez and Justin Johnson and Fei-Fei Li},\nyear={2018},\nurl={https://openreview.net/forum?id=HypkN9yRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HypkN9yRW", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;2;2", "rating_avg": 5.333333333333333, "confidence_avg": 2.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17347964160589395574&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Trust-PCL: An Off-Policy Trust Region Method for Continuous Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/167", "id": "HyrCWeWCb", "author_site": "Ofir Nachum, Mohammad Norouzi, Kelvin Xu, Dale Schuurmans", "tldr": "We extend recent insights related to softmax consistency to achieve state-of-the-art results in continuous control.", "abstract": "Trust region methods, such as TRPO, are often used to stabilize policy optimization algorithms in reinforcement learning (RL). While current trust region strategies are effective for continuous control, they typically require a large amount of on-policy interaction with the environment. To address this problem, we propose an off-policy trust region method, Trust-PCL, which exploits an observation that the optimal policy and state values of a maximum reward objective with a relative-entropy regularizer satisfy a set of multi-step pathwise consistencies along any path. The introduction of relative entropy regularization allows Trust-PCL to maintain optimization stability while exploiting off-policy data to improve sample efficiency. 
When evaluated on a number of continuous control tasks, Trust-PCL significantly improves the solution quality and sample efficiency of TRPO.", "keywords": "Reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ofir Nachum;Mohammad Norouzi;Kelvin Xu;Dale Schuurmans", "authorids": "ofirnachum@google.com;mnorouzi@google.com;iamkelvinxu@gmail.com;schuurmans@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nnachum2018trustpcl,\ntitle={Trust-{PCL}: An Off-Policy Trust Region Method for Continuous Control},\nauthor={Ofir Nachum and Mohammad Norouzi and Kelvin Xu and Dale Schuurmans},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyrCWeWCb},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/models](https://github.com/tensorflow/models)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;1;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11034633680493566157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyrCWeWCb", "pdf": "https://openreview.net/pdf?id=HyrCWeWCb", "email": ";;;", "author_num": 4 }, { "id": "HytSvlWRZ", "title": "Subspace Network: Deep Multi-Task Censored Regression for Modeling Neurodegenerative Diseases", "track": "main", "status": "Reject", "tldr": "", "abstract": "Over the past decade a wide spectrum of machine learning models have been developed to model the neurodegenerative diseases, associating biomarkers, especially non-intrusive neuroimaging markers, with key clinical scores measuring the cognitive status of patients. Multi-task learning (MTL) has been extensively explored in these studies to address challenges associated to high dimensionality and small cohort size. However, most existing MTL approaches are based on linear models and suffer from two major limitations: 1) they cannot explicitly consider upper/lower bounds in these clinical scores; 2) they lack the capability to capture complicated non-linear effects among the variables. In this paper, we propose the Subspace Network, an efficient deep modeling approach for non-linear multi-task censored regression. Each layer of the subspace network performs a multi-task censored regression to improve upon the predictions from the last layer via sketching a low-dimensional subspace to perform knowledge transfer among learning tasks. We show that under mild assumptions, for each layer the parametric subspace can be recovered using only one pass of training data. In addition, empirical results demonstrate that the proposed subspace network quickly picks up correct parameter subspaces, and outperforms state-of-the-arts in predicting neurodegenerative clinical scores using information in brain imaging. ", "keywords": "subspace;censor;multi-task;deep network", "primary_area": "", "supplementary_material": "", "author": "Mengying Sun;Inci M. 
Baytas;Zhangyang Wang;Jiayu Zhou", "authorids": "sunmeng2@msu.edu;baytasin@msu.edu;atlaswang@tamu.edu;jiayuz@msu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsun2018subspace,\ntitle={Subspace Network: Deep Multi-Task Censored Regression for Modeling Neurodegenerative Diseases},\nauthor={Mengying Sun and Inci M. Baytas and Zhangyang Wang and Jiayu Zhou},\nyear={2018},\nurl={https://openreview.net/forum?id=HytSvlWRZ},\n}", "github": "[![github](/images/github_icon.svg) illidanlab/subspace-net](https://github.com/illidanlab/subspace-net)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HytSvlWRZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1396727227067303035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HyunpgbR-", "title": "Structured Exploration via Hierarchical Variational Policy Networks", "track": "main", "status": "Reject", "tldr": "Make deep reinforcement learning in large state-action spaces more efficient using structured exploration with deep hierarchical policies.", "abstract": "Reinforcement learning in environments with large state-action spaces is challenging, as exploration can be highly inefficient. Even if the dynamics are simple, the optimal policy can be combinatorially hard to discover. In this work, we propose a hierarchical approach to structured exploration to improve the sample efficiency of on-policy exploration in large state-action spaces. The key idea is to model a stochastic policy as a hierarchical latent variable model, which can learn low-dimensional structure in the state-action space, and to define exploration by sampling from the low-dimensional latent space. This approach enables lower sample complexity, while preserving policy expressivity. In order to make learning tractable, we derive a joint learning and exploration strategy by combining hierarchical variational inference with actor-critic learning. The benefits of our learning approach are that 1) it is principled, 2) simple to implement, 3) easily scalable to settings with many actions and 4) easily composable with existing deep learning approaches. We demonstrate the effectiveness of our approach on learning a deep centralized multi-agent policy, as multi-agent environments naturally have an exponentially large state-action space. In this setting, the latent hierarchy implements a form of multi-agent coordination during exploration and execution (MACE). We demonstrate empirically that MACE can more efficiently learn optimal policies in challenging multi-agent games with a large number (~20) of agents, compared to conventional baselines. 
Moreover, we show that our hierarchical structure leads to meaningful agent coordination.", "keywords": "Deep Reinforcement Learning;Structured Variational Inference;Multi-agent Coordination;Multi-agent Learning", "primary_area": "", "supplementary_material": "", "author": "Stephan Zheng;Yisong Yue", "authorids": "stephan@caltech.edu;yyue@caltech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzheng2018structured,\ntitle={Structured Exploration via Hierarchical Variational Policy Networks},\nauthor={Stephan Zheng and Yisong Yue},\nyear={2018},\nurl={https://openreview.net/forum?id=HyunpgbR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyunpgbR-", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.7559289460184546, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8273001540064824137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HyxSG1Z3IX", "title": "Withdrawn", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Conditional sequence generation is a widely researched topic. One of the most important tasks is dialogue generation, which is composed of input-output pairs with the one-to-many property. Given the recent success of generative adversarial networks (GANs), GANs have been used for sequence generation. However, there is still limited work of its application on conditional sequence generation. We investigate the influence of GAN on conditional sequence generation with three artificial grammars and dialogue generation. Moreover, we propose stepwise GAN (StepGAN) for conditional sequence generation, which predicts the reward at each time-step. StepGAN can be seen as the general version of SeqGAN. It estimates the expected returns predicted by Monte-Carlo Search in SeqGAN, but it has a lower computational cost than Monte-Carlo Search. Experimental results show that stepwise GAN can outperform other state-of-the-art algorithms in most tasks.", "keywords": "conditional sequence generation;generative adversarial network;REINFORCE;dialogue generation", "primary_area": "", "supplementary_material": "", "author": "Yi-Lin Tuan;Hung-yi Lee", "authorids": "pascaltuan@gmail.com;hungyilee@ntu.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HyxSG1Z3IX", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HyxjwgbRZ", "title": "Convergence rate of sign stochastic gradient descent for non-convex functions", "track": "main", "status": "Reject", "tldr": "We prove a non-convex convergence rate for the sign stochastic gradient method. 
The algorithm has links to algorithms like Adam and Rprop, as well as gradient quantisation schemes used in distributed machine learning.", "abstract": "The sign stochastic gradient descent method (signSGD) utilizes only the sign of the stochastic gradient in its updates. Since signSGD carries out one-bit quantization of the gradients, it is extremely practical for distributed optimization where gradients need to be aggregated from different processors. For the first time, we establish convergence rates for signSGD on general non-convex functions under transparent conditions. We show that the rate of signSGD to reach first-order critical points matches that of SGD in terms of number of stochastic gradient calls, up to roughly a linear factor in the dimension. We carry out simple experiments to explore the behaviour of sign gradient descent (without the stochasticity) close to saddle points and show that it often helps completely avoid them without using either stochasticity or curvature information.", "keywords": "sign;stochastic;gradient;non-convex;optimization;gradient;quantization;convergence;rate", "primary_area": "", "supplementary_material": "", "author": "Jeremy Bernstein;Kamyar Azizzadenesheli;Yu-Xiang Wang;Anima Anandkumar", "authorids": "bernstein@caltech.edu;kazizzad@uci.edu;yuxiangw@cs.cmu.edu;animakumar@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbernstein2018convergence,\ntitle={Convergence rate of sign stochastic gradient descent for non-convex functions},\nauthor={Jeremy Bernstein and Kamyar Azizzadenesheli and Yu-Xiang Wang and Anima Anandkumar},\nyear={2018},\nurl={https://openreview.net/forum?id=HyxjwgbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyxjwgbRZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12803113597223497967&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyyHX4gZM", "title": "Towards Quantum Inspired Convolution Networks", "track": "main", "status": "Withdraw", "tldr": "A quantum inspired kernel for convolution network, exhibiting interference phenomena, can be very useful (and compared it with real value counterpart).", "abstract": "Deep Convolution Neural Networks (CNNs), rooted by the pioneer work of \\cite{Hinton1986,LeCun1985,Alex2012}, and summarized in \\cite{LeCunBengioHinton2015}, have been shown to be very useful in a variety of fields. The state-of-the art CNN machines such as image rest net \\cite{He_2016_CVPR} are described by real value inputs and kernel convolutions followed by the local and non-linear rectified linear outputs. Understanding the role of these layers, the accuracy and limitations of them, as well as making them more efficient (fewer parameters) are all ongoing research questions. \n \n Inspired in quantum theory, we propose the use of complex value kernel functions, followed by the local non-linear absolute (modulus) operator square. We argue that an advantage of quantum inspired complex kernels is robustness to realistic unpredictable scenarios (such as clutter noise, data deformations). 
We study a concrete problem of shape detection and show that when multiple overlapping shapes are deformed and/or clutter noise is added, a convolution layer with quantum inspired complex kernels outperforms the statistical/classical kernel counterpart and a \"Bayesian shape estimator\" . The superior performance is due to the quantum phenomena of interference, not present in classical CNNs. ", "keywords": "quantum technique;convolution networks;shape detection", "primary_area": "", "supplementary_material": "", "author": "Davi Geiger;Zvi Kedem", "authorids": "dg1@nyu.edu;kedem@nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyyHX4gZM", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;3;3", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:u5izsi3lfN8J:scholar.google.com/&scioq=Towards+Quantum+Inspired+Convolution+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Activation Maximization Generative Adversarial Nets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/221", "id": "HyyP33gAZ", "author_site": "Zhiming Zhou, Han Cai, Shu Rong, Yuxuan Song, Kan Ren, Weinan Zhang, Jun Wang, Yong Yu", "tldr": "Understand how class labels help GAN training. Propose a new evaluation metric for generative models. ", "abstract": "Class labels have been empirically shown useful in improving the sample quality of generative adversarial nets (GANs). In this paper, we mathematically study the properties of the current variants of GANs that make use of class label information. With class aware gradient and cross-entropy decomposition, we reveal how class labels and associated losses influence GAN's training. Based on that, we propose Activation Maximization Generative Adversarial Networks (AM-GAN) as an advanced solution. Comprehensive experiments have been conducted to validate our analysis and evaluate the effectiveness of our solution, where AM-GAN outperforms other strong baselines and achieves state-of-the-art Inception Score (8.91) on CIFAR-10. In addition, we demonstrate that, with the Inception ImageNet classifier, Inception Score mainly tracks the diversity of the generator, and there is, however, no reliable evidence that it can reflect the true sample quality. We thus propose a new metric, called AM Score, to provide more accurate estimation on the sample quality. 
Our proposed model also outperforms the baseline methods in the new metric.", "keywords": "Generative Adversarial Nets;GANs;Evaluation Metrics;Generative Model;Deep Learning;Adversarial Learning;Inception Score;AM Score", "primary_area": "", "supplementary_material": "", "author": "Zhiming Zhou;Han Cai;Shu Rong;Yuxuan Song;Kan Ren;Weinan Zhang;Jun Wang;Yong Yu", "authorids": "heyohai@apex.sjtu.edu.cn;hcai@apex.sjtu.edu.cn;shu.rong@yitu-inc.com;songyuxuan@apex.sjtu.edu.cn;kren@apex.sjtu.edu.cn;wnzhang@sjtu.edu.cn;j.wang@cs.ucl.ac.uk;yyu@apex.sjtu.edu.cn", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nzhou2018activation,\ntitle={Activation Maximization Generative Adversarial Nets},\nauthor={Zhiming Zhou and Han Cai and Shu Rong and Yuxuan Song and Kan Ren and Weinan Zhang and Jun Wang and Yong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyyP33gAZ},\n}", "github": "[![github](/images/github_icon.svg) ZhimingZhou/AM-GAN](https://github.com/ZhimingZhou/AM-GAN) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HyyP33gAZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5158804099762139876&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HyyP33gAZ", "pdf": "https://openreview.net/pdf?id=HyyP33gAZ", "email": ";;;;;;;", "author_num": 8 }, { "title": "Spatially Transformed Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/18", "id": "HyydRMZC-", "author_site": "Chaowei Xiao, Jun-Yan Zhu, Bo Li, Warren He, Mingyan Liu, Dawn Song", "tldr": "We propose a new approach for generating adversarial examples based on spatial transformation, which produces perceptually realistic examples compared to existing attacks. ", "abstract": "Recent studies show that widely used Deep neural networks (DNNs) are vulnerable to the carefully crafted adversarial examples.\nMany advanced algorithms have been proposed to generate adversarial examples by leveraging the L_p distance for penalizing perturbations.\nDifferent defense methods have also been explored to defend against such adversarial attacks. \nWhile the effectiveness of L_p distance as a metric of perceptual quality remains an active research area, in this paper we will instead focus on a different type of perturbation, namely spatial transformation, as opposed to manipulating the pixel values directly as in prior works.\nPerturbations generated through spatial transformation could result in large L_p distance measures, but our extensive experiments show that such spatially transformed adversarial examples are perceptually realistic and more difficult to defend against with existing defense systems. 
This potentially provides a new direction in adversarial example generation and the design of corresponding defenses.\nWe visualize the spatial transformation based perturbation for different examples and show that our technique\ncan produce realistic adversarial examples with smooth image deformation.\nFinally, we visualize the attention of deep networks with different types of adversarial examples to better understand how these examples are interpreted.", "keywords": "adversarial examples;spatial transformation", "primary_area": "", "supplementary_material": "", "author": "Chaowei Xiao;Jun-Yan Zhu;Bo Li;Warren He;Mingyan Liu;Dawn Song", "authorids": "xiaocw@umich.edu;junyanzhu89@gmail.com;lxbosky@gmail.com;_w@eecs.berkeley.edu;mingyan@umich.edu;dawnsong.travel@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nxiao2018spatially,\ntitle={Spatially Transformed Adversarial Examples},\nauthor={Chaowei Xiao and Jun-Yan Zhu and Bo Li and Warren He and Mingyan Liu and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyydRMZC-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HyydRMZC-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;9", "confidence": "4;4;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 25, "authors#_avg": 6, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 648, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13716143567515510436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyydRMZC-", "pdf": "https://openreview.net/pdf?id=HyydRMZC-", "email": ";;;;;", "author_num": 6 }, { "title": "Learn to Pay Attention", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/34", "id": "HyzbhfWRW", "author_site": "Saumya Jetley, Nicholas A. Lord, Namhoon Lee, Philip Torr", "tldr": "The paper proposes a method for forcing CNNs to leverage spatial attention in learning more object-centric representations that perform better in various respects.", "abstract": "We propose an end-to-end-trainable attention module for convolutional neural network (CNN) architectures built for image classification. The module takes as input the 2D feature vector maps which form the intermediate representations of the input image at different stages in the CNN pipeline, and outputs a 2D matrix of scores for each map. Standard CNN architectures are modified through the incorporation of this module, and trained under the constraint that a convex combination of the intermediate 2D feature vectors, as parametrised by the score matrices, must alone be used for classification. Incentivised to amplify the relevant and suppress the irrelevant or misleading, the scores thus assume the role of attention values. Our experimental observations provide clear evidence to this effect: the learned attention maps neatly highlight the regions of interest while suppressing background clutter. 
Consequently, the proposed function is able to bootstrap standard CNN architectures for the task of image classification, demonstrating superior generalisation over 6 unseen benchmark datasets. When binarised, our attention maps outperform other CNN-based attention maps, traditional saliency maps, and top object proposals for weakly supervised segmentation as demonstrated on the Object Discovery dataset. We also demonstrate improved robustness against the fast gradient sign method of adversarial attack.", "keywords": "deep learning;attention-aware representations;image classification;weakly supervised segmentation;domain shift;classifier generalisation;robustness to adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Saumya Jetley;Nicholas A. Lord;Namhoon Lee;Philip H. S. Torr", "authorids": "saumya.jetley@stx.ox.ac.uk;nicklord@robots.ox.ac.uk;namhoon.lee@eng.ox.ac.uk;philip.torr@eng.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njetley2018learn,\ntitle={Learn to Pay Attention},\nauthor={Saumya Jetley and Nicholas A. Lord and Namhoon Lee and Philip Torr},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=HyzbhfWRW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=HyzbhfWRW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 596, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9380871404199103044&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyzbhfWRW", "pdf": "https://openreview.net/pdf?id=HyzbhfWRW", "email": ";;;", "author_num": 4 }, { "id": "S1347ot3b", "title": "Exploring Sentence Vectors Through Automatic Summarization", "track": "main", "status": "Reject", "tldr": "A comparison and detailed analysis of various sentence embedding models through the real-world task of automatic summarization.", "abstract": "Vector semantics, especially sentence vectors, have recently been used successfully in many areas of natural language processing. However, relatively little work has explored the internal structure and properties of spaces of sentence vectors. In this paper, we will explore the properties of sentence vectors by studying a particular real-world application: Automatic Summarization. In particular, we show that cosine similarity between sentence vectors and document vectors is strongly correlated with sentence importance and that vector semantics can identify and correct gaps between the sentences chosen so far and the document. In addition, we identify specific dimensions which are linked to effective summaries. To our knowledge, this is the first time specific dimensions of sentence embeddings have been connected to sentence properties. We also compare the features of different methods of sentence embeddings. 
Many of these insights have applications in uses of sentence embeddings far beyond summarization.", "keywords": "Sentence Vectors;Vector Semantics;Automatic Summarization", "primary_area": "", "supplementary_material": "", "author": "Adly Templeton;Jugal Kalita", "authorids": "at7@williams.edu;jkalita@uccs.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntempleton2018exploring,\ntitle={Exploring Sentence Vectors Through Automatic Summarization},\nauthor={Adly Templeton and Jugal Kalita},\nyear={2018},\nurl={https://openreview.net/forum?id=S1347ot3b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1347ot3b", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;5;5", "rating_avg": 2.3333333333333335, "confidence_avg": 5.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jBExBtVBWOwJ:scholar.google.com/&scioq=Exploring+Sentence+Vectors+Through+Automatic+Summarization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S14EogZAZ", "title": "Acquiring Target Stacking Skills by Goal-Parameterized Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding physical phenomena is a key component of human intelligence and enables physical interaction with previously unseen environments. In this paper, we study how an artificial agent can autonomously acquire this intuition through interaction with the environment. We created a synthetic block stacking environment with physics simulation in which the agent can learn a policy end-to-end through trial and error. Thereby, we bypass the need to explicitly model physical knowledge within the policy. We are specifically interested in tasks that require the agent to reach a given goal state that may be different for every new trial. To this end, we propose a deep reinforcement learning framework that learns policies which are parametrized by a goal. We validated the model on a toy example navigating in a grid world with different target positions and in a block stacking task with different target structures of the final tower. 
In contrast to prior work, our policies show better generalization across different goals.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenbin Li;Jeannette Bohg;Mario Fritz", "authorids": "wenbinli@mpi-inf.mpg.de;bohg@stanford.edu;mfritz@mpi-inf.mpg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2018acquiring,\ntitle={Acquiring Target Stacking Skills by Goal-Parameterized Deep Reinforcement Learning},\nauthor={Wenbin Li and Jeannette Bohg and Mario Fritz},\nyear={2018},\nurl={https://openreview.net/forum?id=S14EogZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S14EogZAZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9838152559510191457&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "S1680_1Rb", "title": "CAYLEYNETS: SPECTRAL GRAPH CNNS WITH COMPLEX RATIONAL FILTERS", "track": "main", "status": "Reject", "tldr": "A spectral graph convolutional neural network with spectral zoom properties.", "abstract": "The rise of graph-structured data such as social networks, regulatory networks, citation graphs, and functional brain networks, in combination with resounding success of deep learning in various applications, has brought the interest in generalizing deep learning models to non-Euclidean domains. \nIn this paper, we introduce a new spectral domain convolutional architecture for deep learning on graphs. The core ingredient of our model is a new class of parametric rational complex functions (Cayley polynomials) allowing to efficiently compute spectral filters on graphs that specialize on frequency bands of interest. Our model generates rich spectral filters that are localized in space, scales linearly with the size of the input data for sparsely-connected graphs, and can handle different constructions of Laplacian operators. Extensive experimental results show the superior performance of our approach on spectral image classification, community detection, vertex classification and matrix completion tasks.", "keywords": "Deep Learning;Spectral Graph Convolutional Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Ron Levie;Federico Monti;Xavier Bresson;Michael M. Bronstein", "authorids": "ronlevie@gmail.com;federico.monti@usi.ch;xavier.bresson@gmail.com;michael.bronstein@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlevie2018cayleynets,\ntitle={{CAYLEYNETS}: {SPECTRAL} {GRAPH} {CNNS} {WITH} {COMPLEX} {RATIONAL} {FILTERS}},\nauthor={Ron Levie and Federico Monti and Xavier Bresson and Michael M. 
Bronstein},\nyear={2018},\nurl={https://openreview.net/forum?id=S1680_1Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1680_1Rb", "pdf_size": 0, "rating": "4;6;8", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1o_nL23tBUYJ:scholar.google.com/&scioq=CAYLEYNETS:+SPECTRAL+GRAPH+CNNS+WITH+COMPLEX+RATIONAL+FILTERS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S16FPMgRZ", "title": "Tensor Contraction & Regression Networks", "track": "main", "status": "Reject", "tldr": "We propose tensor contraction and low-rank tensor regression layers to preserve and leverage the multi-linear structure throughout the network, resulting in huge space savings with little to no impact on performance.", "abstract": "Convolution neural networks typically consist of many convolutional layers followed by several fully-connected layers. While convolutional layers map between high-order activation tensors, the fully-connected layers operate on flattened activation vectors. Despite its success, this approach has notable drawbacks. Flattening discards the multi-dimensional structure of the activations, and the fully-connected layers require a large number of parameters. \nWe present two new techniques to address these problems. First, we introduce tensor contraction layers which can replace the ordinary fully-connected layers in a neural network. Second, we introduce tensor regression layers, which express the output of a neural network as a low-rank multi-linear mapping from a high-order activation tensor to the softmax layer. Both the contraction and regression weights are learned end-to-end by backpropagation. By imposing low rank on both, we use significantly fewer parameters. 
Experiments on the ImageNet dataset show that applied to the popular VGG and ResNet architectures, our methods significantly reduce the number of parameters in the fully connected layers (about 65% space savings) while negligibly impacting accuracy.", "keywords": "tensor contraction;tensor regression;network compression;deep neural networks", "primary_area": "", "supplementary_material": "", "author": "Jean Kossaifi;Zack Chase Lipton;Aran Khanna;Tommaso Furlanello;Anima Anandkumar", "authorids": "jean.kossaifi@gmail.com;zlipton@cmu.edu;arankhan@amazon.com;tfurlanello@gmail.com;animakumar@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkossaifi2018tensor,\ntitle={Tensor Contraction & Regression Networks},\nauthor={Jean Kossaifi and Zack Chase Lipton and Aran Khanna and Tommaso Furlanello and Anima Anandkumar},\nyear={2018},\nurl={https://openreview.net/forum?id=S16FPMgRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S16FPMgRZ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5725198021159581574&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "S17mtzbRb", "title": "Forced Apart: Discovering Disentangled Representations Without Exhaustive Labels", "track": "main", "status": "Reject", "tldr": "A novel loss component that forces the network to learn a representation that is well-suited for clustering during training for a classification task.", "abstract": "Learning a better representation with neural networks is a challenging problem, which has been tackled from different perspectives in the past few years. In this work, we focus on learning a representation that would be useful in a clustering task. We introduce two novel loss components that substantially improve the quality of produced clusters, are simple to apply to arbitrary models and cost functions, and do not require a complicated training procedure. 
We perform an extensive set of experiments, supervised and unsupervised, and evaluate the proposed loss components on two most common types of models, Recurrent Neural Networks and Convolutional Neural Networks, showing that the approach we propose consistently improves the quality of KMeans clustering in terms of mutual information scores and outperforms previously proposed methods.", "keywords": "learning representation;clustering;loss", "primary_area": "", "supplementary_material": "", "author": "Alexey Romanov;Anna Rumshisky", "authorids": "jgc128@outlook.com;arum@cs.uml.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nromanov2018forced,\ntitle={Forced Apart: Discovering Disentangled Representations Without Exhaustive Labels},\nauthor={Alexey Romanov and Anna Rumshisky},\nyear={2018},\nurl={https://openreview.net/forum?id=S17mtzbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S17mtzbRb", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14554261743007923545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Thermometer Encoding: One Hot Way To Resist Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/120", "id": "S18Su--CW", "author_site": "Jacob Buckman, Aurko Roy, Colin Raffel, Ian Goodfellow", "tldr": "Input discretization leads to robustness against adversarial examples", "abstract": "It is well known that it is possible to construct \"adversarial examples\"\nfor neural networks: inputs which are misclassified by the network\nyet indistinguishable from true data. We propose a simple\nmodification to standard neural network architectures, thermometer\nencoding, which significantly increases the robustness of the network to\nadversarial examples. 
We demonstrate this robustness with experiments\non the MNIST, CIFAR-10, CIFAR-100, and SVHN datasets, and show that\nmodels with thermometer-encoded inputs consistently have higher accuracy\non adversarial examples, without decreasing generalization.\nState-of-the-art accuracy under the strongest known white-box attack was \nincreased from 93.20% to 94.30% on MNIST and 50.00% to 79.16% on CIFAR-10.\nWe explore the properties of these networks, providing evidence\nthat thermometer encodings help neural networks to\nfind more-non-linear decision boundaries.", "keywords": "Adversarial examples;robust neural networks", "primary_area": "", "supplementary_material": "", "author": "Jacob Buckman;Aurko Roy;Colin Raffel;Ian Goodfellow", "authorids": "buckman@google.com;aurkor@google.com;craffel@google.com;goodfellow@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbuckman2018thermometer,\ntitle={Thermometer Encoding: One Hot Way To Resist Adversarial Examples},\nauthor={Jacob Buckman and Aurko Roy and Colin Raffel and Ian Goodfellow},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S18Su--CW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;2", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 771, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14437133120740920933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S18Su--CW", "pdf": "https://openreview.net/pdf?id=S18Su--CW", "email": ";;;", "author_num": 4 }, { "id": "S191YzbRZ", "title": "Prototype Matching Networks for Large-Scale Multi-label Genomic Sequence Classification", "track": "main", "status": "Reject", "tldr": "We combine the matching network framework for few shot learning into a large scale multi-label model for genomic sequence classification.", "abstract": "One of the fundamental tasks in understanding genomics is the problem of predicting Transcription Factor Binding Sites (TFBSs). With more than hundreds of Transcription Factors (TFs) as labels, genomic-sequence based TFBS prediction is a challenging multi-label classification task. There are two major biological mechanisms for TF binding: (1) sequence-specific binding patterns on genomes known as \u201cmotifs\u201d and (2) interactions among TFs known as co-binding effects. In this paper, we propose a novel deep architecture, the Prototype Matching Network (PMN) to mimic the TF binding mechanisms. Our PMN model automatically extracts prototypes (\u201cmotif\u201d-like features) for each TF through a novel prototype-matching loss. Borrowing ideas from few-shot matching models, we use the notion of support set of prototypes and an LSTM to learn how TFs interact and bind to genomic sequences. On a reference TFBS dataset with 2.1 million genomic sequences, PMN significantly outperforms baselines and validates our design choices empirically. To our knowledge, this is the first deep learning architecture that introduces prototype learning and considers TF-TF interactions for large scale TFBS prediction. 
Not only is the proposed architecture accurate, but it also models the underlying biology.", "keywords": "bioinformatics;multi-label classification;matching networks;prototypes;memory networks;attention", "primary_area": "", "supplementary_material": "", "author": "Jack Lanchantin;Arshdeep Sekhon;Ritambhara Singh;Yanjun Qi", "authorids": "jjl5sw@virginia.edu;as5cu@virginia.edu;rs3zz@virginia.edu;yq2h@virginia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlanchantin2018prototype,\ntitle={Prototype Matching Networks for Large-Scale Multi-label Genomic Sequence Classification},\nauthor={Jack Lanchantin and Arshdeep Sekhon and Ritambhara Singh and Yanjun Qi},\nyear={2018},\nurl={https://openreview.net/forum?id=S191YzbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S191YzbRZ", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;3", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5138396679758832016&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Alternating Multi-bit Quantization for Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/235", "id": "S19dR9x0b", "author_site": "Chen Xu, Jianqiang Yao, Zhouchen Lin, Baigui Sun, Yuanbin Cao, Zhirong Wang, Hongbin Zha", "tldr": "We propose a new quantization method and apply it to quantize RNNs for both compression and acceleration", "abstract": "Recurrent neural networks have achieved excellent performance in many applications. However, on portable devices with limited resources, the models are often too large to deploy. For applications on the server with large scale concurrent requests, the latency during inference can also be very critical for costly computing resources. In this work, we address these problems by quantizing the network, both weights and activations, into multiple binary codes {-1,+1}. We formulate the quantization as an optimization problem. Under the key observation that once the quantization coefficients are fixed the binary codes can be derived efficiently by binary search tree, alternating minimization is then applied. We test the quantization for two well-known RNNs, i.e., long short term memory (LSTM) and gated recurrent unit (GRU), on the language models. Compared with the full-precision counterpart, by 2-bit quantization we can achieve ~16x memory saving and ~6x real inference acceleration on CPUs, with only a reasonable loss in the accuracy. By 3-bit quantization, we can achieve almost no loss in the accuracy or even surpass the original model, with ~10.5x memory saving and ~3x real inference acceleration. Both results beat the existing quantization works with large margins. We extend our alternating quantization to image classification tasks. 
In both RNNs and feedforward neural networks, the method also achieves excellent performance.", "keywords": "Alternating Minimization;Quantized Recurrent Neural Network;Binary Search Tree", "primary_area": "", "supplementary_material": "", "author": "Chen Xu;Jianqiang Yao;Zhouchen Lin;Wenwu Ou;Yuanbin Cao;Zhirong Wang;Hongbin Zha", "authorids": "xuen@pku.edu.cn;tianduo@taobao.com;zlin@pku.edu.cn;santong.oww@taobao.com;lingzun.cyb@alibaba-inc.com;qingfeng@taobao.com;zha@cis.pku.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nxu2018alternating,\ntitle={Alternating Multi-bit Quantization for Recurrent Neural Networks},\nauthor={Chen Xu and Jianqiang Yao and Zhouchen Lin and Wenwu Ou and Yuanbin Cao and Zhirong Wang and Hongbin Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S19dR9x0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;2;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=182803485359633462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S19dR9x0b", "pdf": "https://openreview.net/pdf?id=S19dR9x0b", "email": ";;;;;;", "author_num": 7 }, { "title": "Maximum a Posteriori Policy Optimisation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/9", "id": "S1ANxQW0b", "author_site": "abbas abdolmaleki, Jost Tobias Springenberg, Nicolas Heess, Yuval Tassa, Remi Munos", "tldr": "", "abstract": "We introduce a new algorithm for reinforcement learning called Maximum a-posteriori Policy Optimisation (MPO) based on coordinate ascent on a relative-entropy objective. We show that several existing methods can directly be related to our derivation. We develop two off-policy algorithms and demonstrate that they are competitive with the state-of-the-art in deep reinforcement learning. 
In particular, for continuous control, our method outperforms existing methods with respect to sample efficiency, premature convergence and robustness to hyperparameter settings.", "keywords": "Reinforcement Learning;Variational Inference;Control", "primary_area": "", "supplementary_material": "", "author": "Abbas Abdolmaleki;Jost Tobias Springenberg;Yuval Tassa;Remi Munos;Nicolas Heess;Martin Riedmiller", "authorids": "abbas.abdolmaleky@gmail.com;springenberg@google.com;;;;", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nabdolmaleki2018maximum,\ntitle={Maximum a Posteriori Policy Optimisation},\nauthor={Abbas Abdolmaleki and Jost Tobias Springenberg and Yuval Tassa and Remi Munos and Nicolas Heess and Martin Riedmiller},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1ANxQW0b},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=S1ANxQW0b)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;1;5", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 6, "corr_rating_confidence": 0.24019223070763066, "gs_citation": 580, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14521646117118037069&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1ANxQW0b", "pdf": "https://openreview.net/pdf?id=S1ANxQW0b", "email": ";;;;;", "author_num": 6 }, { "id": "S1Auv-WRZ", "title": "Data Augmentation Generative Adversarial Networks", "track": "main", "status": "Workshop", "tldr": "Conditional GANs trained to generate data augmented samples of their conditional inputs used to enhance vanilla classification and one shot learning systems such as matching networks and pixel distance", "abstract": "Effective training of neural networks requires much data. In the low-data regime,\nparameters are underdetermined, and learnt networks generalise poorly. Data\nAugmentation (Krizhevsky et al., 2012) alleviates this by using existing data\nmore effectively. However standard data augmentation produces only limited\nplausible alternative data. Given there is potential to generate a much broader set\nof augmentations, we design and train a generative model to do data augmentation.\nThe model, based on image conditional Generative Adversarial Networks, takes\ndata from a source domain and learns to take any data item and generalise it\nto generate other within-class data items. As this generative process does not\ndepend on the classes themselves, it can be applied to novel unseen classes of data.\nWe show that a Data Augmentation Generative Adversarial Network (DAGAN)\naugments standard vanilla classifiers well. We also show a DAGAN can enhance\nfew-shot learning systems such as Matching Networks. We demonstrate these\napproaches on Omniglot, on EMNIST having learnt the DAGAN on Omniglot, and\nVGG-Face data. 
In our experiments we can see over 13% increase in accuracy in\nthe low-data regime experiments in Omniglot (from 69% to 82%), EMNIST (73.9%\nto 76%) and VGG-Face (4.5% to 12%); in Matching Networks for Omniglot we\nobserve an increase of 0.5% (from 96.9% to 97.4%) and an increase of 1.8% in\nEMNIST (from 59.5% to 61.3%).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anthreas Antoniou;Amos Storkey;Harrison Edwards", "authorids": "a.antoniou@sms.ed.ac.uk;a.storkey@ed.ac.uk;h.l.edwards@sms.ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nantoniou2018data,\ntitle={Data Augmentation Generative Adversarial Networks},\nauthor={Anthreas Antoniou and Amos Storkey and Harrison Edwards},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Auv-WRZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=S1Auv-WRZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1Auv-WRZ", "pdf_size": 0, "rating": "4;6;9", "confidence": "4;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.5960395606792698, "gs_citation": 1531, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9773523094742397382&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S1Azqvy1G", "title": "Pseudo sequence based deep neural network compression", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Along with the performance increase of the neural network, both the number of layers and the number of parameters in each layer are becoming larger and larger. Therefore, there are more and more works trying to compress the neural network efficiently while keeping the performance. However, all of them have not taken the similarity among the kernels into consideration. In this paper, we try to organize the kernels in different channels into a frame and encode the frames using block-based video coding methods. First, we try to reshape the weights in different channels into a pseudo sequence. Second, after obtaining all the frames in the videos, we will convert the weight into the PCA domain to obtain a more compact representation. Then both the intra prediction and the inter prediction will be performed in the PCA domain to achieve better performance. Finally, the uniform quantization and entropy coding will be used to encode the residue blocks. The experimental results show that we can achieve $58$ times compression for the classical VGG-16 model. 
In addition to the very high compression ratio, the proposed method can also achieve a better balance between the bits per weight and the error at a finer granularity by adjusting the quantization parameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper586/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018pseudo,\n title={Pseudo sequence based deep neural network compression},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=ByUsSgb0-}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=S1Azqvy1G", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Ask the Right Questions: Active Question Reformulation with Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/107", "id": "S1CChZ-CZ", "author_site": "Christian Buck, Jannis Bulian, Massimiliano Ciaramita, Wojciech Gajewski, Andrea Gesmundo, Neil Houlsby, Wei Wang.", "tldr": "We propose an agent that sits between the user and a black box question-answering system and which learns to reformulate questions to elicit the best possible answers", "abstract": "We frame Question Answering (QA) as a Reinforcement Learning task, an approach that we call Active Question Answering. \n\nWe propose an agent that sits between the user and a black box QA system and learns to reformulate questions to elicit the best possible answers. The agent probes the system with, potentially many, natural language reformulations of an initial question and aggregates the returned evidence to yield the best answer. \n\nThe reformulation system is trained end-to-end to maximize answer quality using policy gradient. We evaluate on SearchQA, a dataset of complex questions extracted from Jeopardy!. The agent outperforms a state-of-the-art base model, playing the role of the environment, and other benchmarks.\n\nWe also analyze the language that the agent has learned while interacting with the question answering system. We find that successful question reformulations look quite different from natural language paraphrases. 
The agent is able to discover non-trivial reformulation strategies that resemble classic information retrieval techniques such as term re-weighting (tf-idf) and stemming.", "keywords": "machine translation;paraphrasing;question answering;reinforcement learning;agents", "primary_area": "", "supplementary_material": "", "author": "Christian Buck;Jannis Bulian;Massimiliano Ciaramita;Wojciech Gajewski;Andrea Gesmundo;Neil Houlsby;Wei Wang.", "authorids": "cbuck@google.com;jbulian@google.com;massi@google.com;wgaj@google.com;agesmundo@google.com;neilhoulsby@google.com;wangwe@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nbuck2018ask,\ntitle={Ask the Right Questions: Active Question Reformulation with Reinforcement Learning},\nauthor={Christian Buck and Jannis Bulian and Massimiliano Ciaramita and Wojciech Gajewski and Andrea Gesmundo and Neil Houlsby and Wei Wang.},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1CChZ-CZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=S1CChZ-CZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 7, "corr_rating_confidence": -0.5, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4524039328183548455&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=S1CChZ-CZ", "pdf": "https://openreview.net/pdf?id=S1CChZ-CZ", "email": ";;;;;;", "author_num": 7 }, { "title": "Viterbi-based Pruning for Sparse Matrix with Fixed and High Index Compression Ratio", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/256", "id": "S1D8MPxA-", "author_site": "Dongsoo Lee, Daehyun Ahn, Taesu Kim, Pierce I Chuang, Jae-Joon Kim", "tldr": "We present a new pruning method and sparse matrix format to enable high index compression ratio and parallel index decoding process.", "abstract": "Weight pruning has proven to be an effective method in reducing the model size and computation cost while not sacrificing the model accuracy. Conventional sparse matrix formats, however, involve irregular index structures with large storage requirement and sequential reconstruction process, resulting in inefficient use of highly parallel computing resources. Hence, pruning is usually restricted to inference with a batch size of one, for which an efficient parallel matrix-vector multiplication method exists. In this paper, a new class of sparse matrix representation utilizing Viterbi algorithm that has a high, and more importantly, fixed index compression ratio regardless of the pruning rate, is proposed. In this approach, numerous sparse matrix candidates are first generated by the Viterbi encoder, and then the one that aims to minimize the model accuracy degradation is selected by the Viterbi algorithm. The model pruning process based on the proposed Viterbi encoder and Viterbi algorithm is highly parallelizable, and can be implemented efficiently in hardware to achieve low-energy, high-performance index decoding process. 
Compared with the existing magnitude-based pruning methods, index data storage requirement can be further compressed by 85.2% in MNIST and 83.9% in AlexNet while achieving similar pruning rate. Even compared with the relative index compression technique, our method can still reduce the index storage requirement by 52.7% in MNIST and 35.5% in AlexNet.", "keywords": "pruning;sparse matrix;memory footprint;model size;model compression", "primary_area": "", "supplementary_material": "", "author": "Dongsoo Lee;Daehyun Ahn;Taesu Kim;Pierce I. Chuang;Jae-Joon Kim", "authorids": "dslee3@gmail.com;daehyun.ahn@postech.ac.kr;taesukim@postech.ac.kr;pchuang@us.ibm.com;jaejoon@postech.ac.kr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlee2018viterbibased,\ntitle={Viterbi-based Pruning for Sparse Matrix with Fixed and High Index Compression Ratio},\nauthor={Dongsoo Lee and Daehyun Ahn and Taesu Kim and Pierce I. Chuang and Jae-Joon Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1D8MPxA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8920021848200630175&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=S1D8MPxA-", "pdf": "https://openreview.net/pdf?id=S1D8MPxA-", "email": ";;;;", "author_num": 5 }, { "title": "Unsupervised Learning of Goal Spaces for Intrinsically Motivated Goal Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/309", "id": "S1DWPP1A-", "author_site": "Alexandre P\u00e9r\u00e9, S\u00e9bastien Forestier, Olivier Sigaud, Pierre-Yves Oudeyer", "tldr": "We propose a novel Intrinsically Motivated Goal Exploration architecture with unsupervised learning of goal space representations, and evaluate how various implementations enable the discovery of a diversity of policies.", "abstract": "Intrinsically motivated goal exploration algorithms enable machines to discover repertoires of policies that produce a diversity of effects in complex environments. These exploration algorithms have been shown to allow real world robots to acquire skills such as tool use in high-dimensional continuous state and action spaces. However, they have so far assumed that self-generated goals are sampled in a specifically engineered feature space, limiting their autonomy. In this work, we propose an approach using deep representation learning algorithms to learn an adequate goal space. This is a developmental 2-stage approach: first, in a perceptual learning stage, deep learning algorithms use passive raw sensor observations of world changes to learn a corresponding latent space; then goal exploration happens in a second stage by sampling goals in this latent space. 
We present experiments with a simulated robot arm interacting with an object, and we show that exploration algorithms using such learned representations can closely match, and even sometimes improve, the performance obtained using engineered representations.", "keywords": "exploration; autonomous goal setting; diversity; unsupervised learning; deep neural network", "primary_area": "", "supplementary_material": "", "author": "Alexandre P\u00e9r\u00e9;S\u00e9bastien Forestier;Olivier Sigaud;Pierre-Yves Oudeyer", "authorids": "alexandre.pere@inria.fr;sebastien.forestier@inria.fr;olivier.sigaud@upmc.fr;pierre-yves.oudeyer@inria.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\np\u00e9r\u00e92018unsupervised,\ntitle={Unsupervised Learning of Goal Spaces for Intrinsically Motivated Goal Exploration},\nauthor={Alexandre P\u00e9r\u00e9 and S\u00e9bastien Forestier and Olivier Sigaud and Pierre-Yves Oudeyer},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1DWPP1A-},\n}", "github": "[![github](/images/github_icon.svg) flowersteam/Unsupervised_Goal_Space_Learning](https://github.com/flowersteam/Unsupervised_Goal_Space_Learning)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17844977813077230695&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1DWPP1A-", "pdf": "https://openreview.net/pdf?id=S1DWPP1A-", "email": ";;;", "author_num": 4 }, { "title": "Fix your classifier: the marginal value of training the last weight layer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/215", "id": "S1Dh8Tg0-", "author_site": "Elad Hoffer, Itay Hubara, Daniel Soudry", "tldr": "You can fix the classifier in neural networks without losing accuracy", "abstract": "Neural networks are commonly used as models for classification for a wide variety of tasks. Typically, a learned affine transformation is placed at the end of such models, yielding a per-class value used for classification. This classifier can have a vast number of parameters, which grows linearly with the number of possible classes, thus requiring increasingly more resources.\n\nIn this work we argue that this classifier can be fixed, up to a global scale constant, with little or no loss of accuracy for most tasks, allowing memory and computational benefits. Moreover, we show that by initializing the classifier with a Hadamard matrix we can speed up inference as well. 
We discuss the implications for current understanding of neural network models.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Elad Hoffer;Itay Hubara;Daniel Soudry", "authorids": "elad.hoffer@gmail.com;itayhubara@gmail.com;daniel.soudry@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhoffer2018fix,\ntitle={Fix your classifier: the marginal value of training the last weight layer},\nauthor={Elad Hoffer and Itay Hubara and Daniel Soudry},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Dh8Tg0-},\n}", "github": "[![github](/images/github_icon.svg) eladhoffer/fix_your_classifier](https://github.com/eladhoffer/fix_your_classifier) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=S1Dh8Tg0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;5;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10161515370917941482&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1Dh8Tg0-", "pdf": "https://openreview.net/pdf?id=S1Dh8Tg0-", "email": ";;", "author_num": 3 }, { "id": "S1EfylZ0Z", "title": "Anomaly Detection with Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "We propose a method for anomaly detection with GANs by searching the generator's latent space for good sample representations.", "abstract": "Many anomaly detection methods exist that perform well on low-dimensional problems; however, there is a notable lack of effective methods for high-dimensional spaces, such as images. Inspired by recent successes in deep learning, we propose a novel approach to anomaly detection using generative adversarial networks. Given a sample under consideration, our method is based on searching for a good representation of that sample in the latent space of the generator; if such a representation is not found, the sample is deemed anomalous. 
We achieve state-of-the-art performance on standard image benchmark datasets and visual inspection of the most anomalous samples reveals that our method does indeed return anomalies.", "keywords": "Anomaly Detection;Generative Adversarial Networks;Deep Learning;Inverse Problems", "primary_area": "", "supplementary_material": "", "author": "Lucas Deecke;Robert Vandermeulen;Lukas Ruff;Stephan Mandt;Marius Kloft", "authorids": "ldeecke@gmail.com;vandermeulen@cs.uni-kl.de;contact@lukasruff.com;stephan.mandt@disneyresearch.com;kloft@cs.uni-kl.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndeecke2018anomaly,\ntitle={Anomaly Detection with Generative Adversarial Networks},\nauthor={Lucas Deecke and Robert Vandermeulen and Lukas Ruff and Stephan Mandt and Marius Kloft},\nyear={2018},\nurl={https://openreview.net/forum?id=S1EfylZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1EfylZ0Z", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Compositional Attention Networks for Machine Reasoning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/59", "id": "S1Euwz-Rb", "author_site": "Drew A. Hudson, Christopher Manning", "tldr": "We present a novel architecture, based on dynamic memory, attention and composition for the task of machine reasoning.", "abstract": "We present Compositional Attention Networks, a novel fully differentiable neural network architecture, designed to facilitate explicit and expressive reasoning. While many types of neural networks are effective at learning and generalizing from massive quantities of data, this model moves away from monolithic black-box architectures towards a design that provides a strong prior for iterative reasoning, enabling it to support explainable and structured learning, as well as generalization from a modest amount of data. The model builds on the great success of existing recurrent cells such as LSTMs: It sequences a single recurrent Memory, Attention, and Control (MAC) cell, and by careful design imposes structural constraints on the operation of each cell and the interactions between them, incorporating explicit control and soft attention mechanisms into their interfaces. We demonstrate the model's strength and robustness on the challenging CLEVR dataset for visual reasoning, achieving a new state-of-the-art 98.9% accuracy, halving the error rate of the previous best model. More importantly, we show that the new model is more computationally efficient, data-efficient, and requires an order of magnitude less time and/or data to achieve good results.", "keywords": "Deep Learning;Reasoning;Memory;Attention;VQA;CLEVR;Recurrent Neural Networks;Module Networks;Compositionality", "primary_area": "", "supplementary_material": "", "author": "Drew A. Hudson;Christopher D. 
Manning", "authorids": "dorarad@cs.stanford.edu;manning@cs.stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\narad2018compositional,\ntitle={Compositional Attention Networks for Machine Reasoning},\nauthor={Drew Arad Hudson and Christopher D. Manning},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Euwz-Rb},\n}", "github": "[![github](/images/github_icon.svg) stanfordnlp/mac-network](https://github.com/stanfordnlp/mac-network) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=S1Euwz-Rb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 684, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6263143180991689473&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1Euwz-Rb", "pdf": "https://openreview.net/pdf?id=S1Euwz-Rb", "email": ";", "author_num": 2 }, { "id": "S1EwLkW0W", "title": "Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients", "track": "main", "status": "Reject", "tldr": "Analyzing the popular Adam optimizer", "abstract": "The ADAM optimizer is exceedingly popular in the deep learning community. Often it works very well, sometimes it doesn\u2019t. Why? We interpret ADAM as a combination of two aspects: for each weight, the update direction is determined by the sign of the stochastic gradient, whereas the update magnitude is solely determined by an estimate of its relative variance. We disentangle these two aspects and analyze them in isolation, shedding light on ADAM \u2019s inner workings. 
Transferring the \"variance adaptation\u201d to momentum- SGD gives rise to a novel method, completing the practitioner\u2019s toolbox for problems where ADAM fails.", "keywords": "Stochastic Optimization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Lukas Balles;Philipp Hennig", "authorids": "lukas.balles@tuebingen.mpg.de;ph@tue.mpg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nballes2018dissecting,\ntitle={Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients},\nauthor={Lukas Balles and Philipp Hennig},\nyear={2018},\nurl={https://openreview.net/forum?id=S1EwLkW0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1EwLkW0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 203, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7051163857828136426&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "id": "S1EzRgb0W", "title": "Explaining the Mistakes of Neural Networks with Latent Sympathetic Examples", "track": "main", "status": "Reject", "tldr": "New way of explaining why a neural network has misclassified an image", "abstract": "Neural networks make mistakes. The reason why a mistake is made often remains a mystery. As such neural networks often are considered a black box. It would be useful to have a method that can give an explanation that is intuitive to a user as to why an image is misclassified. In this paper we develop a method for explaining the mistakes of a classifier model by visually showing what must be added to an image such that it is correctly classified. Our work combines the fields of adversarial examples, generative modeling and a correction technique based on difference target propagation to create an technique that creates explanations of why an image is misclassified. In this paper we explain our method and demonstrate it on MNIST and CelebA. 
This approach could aid in demystifying neural networks for a user.\n", "keywords": "Deep learning;Adversarial Examples;Difference Target Propagation;Generative Modelling;Classifiers;Explaining;Sympathetic Examples", "primary_area": "", "supplementary_material": "", "author": "Riaan Zoetmulder;Efstratios Gavves;Peter O'Connor", "authorids": "riaan.zoetmulder@student.uva.nl;egavves@uva.nl;peter.ed.oconnor@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzoetmulder2018explaining,\ntitle={Explaining the Mistakes of Neural Networks with Latent Sympathetic Examples},\nauthor={Riaan Zoetmulder and Efstratios Gavves and Peter O'Connor},\nyear={2018},\nurl={https://openreview.net/forum?id=S1EzRgb0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1EzRgb0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sad4TbydGOoJ:scholar.google.com/&scioq=Explaining+the+Mistakes+of+Neural+Networks+with+Latent+Sympathetic+Examples&hl=en&as_sdt=0,14", "gs_version_total": 2 }, { "id": "S1FFLWWCZ", "title": "LSD-Net: Look, Step and Detect for Joint Navigation and Multi-View Recognition with Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-view recognition is the task of classifying an object from multi-view image sequences. Instead of using a single-view for classification, humans generally navigate around a target object to learn its multi-view representation. Motivated by this human behavior, the next best view can be learned by combining object recognition with navigation in complex environments. Since deep reinforcement learning has proven successful in navigation tasks, we propose a novel multi-task reinforcement learning framework for joint multi-view recognition and navigation. Our method uses a hierarchical action space for multi-task reinforcement learning. The framework was evaluated with an environment created from the ModelNet40 dataset. 
Our results show improvements on object recognition and demonstrate human-like behavior on navigation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "N dinesh reddy", "authorids": "dnarapur@andrew.cmu.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndinesh2018lsdnet,\ntitle={{LSD}-Net: Look, Step and Detect for Joint Navigation and Multi-View Recognition with Deep Reinforcement Learning},\nauthor={N dinesh reddy},\nyear={2018},\nurl={https://openreview.net/forum?id=S1FFLWWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1FFLWWCZ", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "S1FQEfZA-", "title": "A Classification-Based Perspective on GAN Distributions", "track": "main", "status": "Reject", "tldr": "We propose new methods for evaluating and quantifying the quality of synthetic GAN distributions from the perspective of classification tasks", "abstract": "A fundamental, and still largely unanswered, question in the context of Generative Adversarial Networks (GANs) is whether GANs are actually able to capture the key characteristics of the datasets they are trained on. The current approaches to examining this issue require significant human supervision, such as visual inspection of sampled images, and often offer only fairly limited scalability. In this paper, we propose new techniques that employ classification-based perspective to evaluate synthetic GAN distributions and their capability to accurately reflect the essential properties of the training data. These techniques require only minimal human supervision and can easily be scaled and adapted to evaluate a variety of state-of-the-art GANs on large, popular datasets. They also indicate that GANs have significant problems in reproducing the more distributional properties of the training dataset. 
In particular, the diversity of such synthetic data is orders of magnitude smaller than that of the original data.", "keywords": "Generative adversarial networks;classification;benchmark;mode collapse;diversity", "primary_area": "", "supplementary_material": "", "author": "Shibani Santurkar;Ludwig Schmidt;Aleksander Madry", "authorids": "shibani@mit.edu;ludwigs@mit.edu;madry@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsanturkar2018a,\ntitle={A Classification-Based Perspective on {GAN} Distributions},\nauthor={Shibani Santurkar and Ludwig Schmidt and Aleksander Madry},\nyear={2018},\nurl={https://openreview.net/forum?id=S1FQEfZA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1FQEfZA-", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17597864795238487850&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "S1GDXzb0b", "title": "Model-based imitation learning from state trajectories", "track": "main", "status": "Reject", "tldr": "Learning to imitate an expert in the absence of optimal actions learning a dynamics model while exploring the environment.", "abstract": "Imitation learning from demonstrations usually relies on learning a policy from trajectories of optimal states and actions. However, in real life expert demonstrations, often the action information is missing and only state trajectories are available. We present a model-based imitation learning method that can learn environment-specific optimal actions only from expert state trajectories. Our proposed method starts with a model-free reinforcement learning algorithm with a heuristic reward signal to sample environment dynamics, which is then used to train the state-transition probability. Subsequently, we learn the optimal actions from expert state trajectories by supervised learning, while back-propagating the error gradients through the modeled environment dynamics. Experimental evaluations show that our proposed method successfully achieves performance similar to (state, action) trajectory-based traditional imitation learning methods even in the absence of action information, with much fewer iterations compared to conventional model-free reinforcement learning methods. 
We also demonstrate that our method can learn to act from only video demonstrations of an expert agent for simple games and can learn to achieve the desired performance in fewer iterations.", "keywords": "Model based reinforcement learning;Imitation learning;dynamics model", "primary_area": "", "supplementary_material": "", "author": "Subhajit Chaudhury;Daiki Kimura;Tadanobu Inoue;Ryuki Tachibana", "authorids": "subhajit@jp.ibm.com;daiki@jp.ibm.com;inouet@jp.ibm.com;ryuki@jp.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchaudhury2018modelbased,\ntitle={Model-based imitation learning from state trajectories},\nauthor={Subhajit Chaudhury and Daiki Kimura and Tadanobu Inoue and Ryuki Tachibana},\nyear={2018},\nurl={https://openreview.net/forum?id=S1GDXzb0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1GDXzb0b", "pdf_size": 0, "rating": "3;4;7", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.9607689228305228, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DoGuWI8zCqgJ:scholar.google.com/&scioq=Model-based+imitation+learning+from+state+trajectories&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "S1GUgxgCW", "title": "Latent Topic Conversational Models", "track": "main", "status": "Reject", "tldr": "Latent Topic Conversational Model, a hybrid of seq2seq and neural topic model to generate more diverse and interesting responses.", "abstract": "Despite much success in many large-scale language tasks, sequence-to-sequence (seq2seq) models have not been an ideal choice for conversational modeling as they tend to generate generic and repetitive responses. In this paper, we propose a Latent Topic Conversational Model (LTCM) that augments the seq2seq model with a neural topic component to better model human-human conversations. The neural topic component encodes information from the source sentence to build a global \u201ctopic\u201d distribution over words, which is then consulted by the seq2seq model to improve generation at each time step. The experimental results show that the proposed LTCM can generate more diverse and interesting responses by sampling from its learnt latent representations. 
In a subjective human evaluation, the judges also confirm that LTCM is the preferred option compared to competitive baseline models.\n", "keywords": "conversational modeling;dialogue;chitchat;open-domain dialogue;topic model;neural variational inference;human evaluation;latent variable model;gaussian reparameterisation trick", "primary_area": "", "supplementary_material": "", "author": "Tsung-Hsien Wen;Minh-Thang Luong", "authorids": "thw28@cam.ac.uk;thangluong@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwen2018latent,\ntitle={Latent Topic Conversational Models},\nauthor={Tsung-Hsien Wen and Minh-Thang Luong},\nyear={2018},\nurl={https://openreview.net/forum?id=S1GUgxgCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1GUgxgCW", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8789233136410004685&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "The Kanerva Machine: A Generative Distributed Memory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/105", "id": "S1HlA-ZAZ", "author_site": "Yan Wu, Greg Wayne, Alex Graves, Timothy Lillicrap", "tldr": "A generative memory model that combines slow-learning neural networks and a fast-adapting linear Gaussian model as memory.", "abstract": "We present an end-to-end trained memory system that quickly adapts to new data and generates samples like them. Inspired by Kanerva's sparse distributed memory, it has a robust distributed reading and writing mechanism. The memory is analytically tractable, which enables optimal on-line compression via a Bayesian update-rule. We formulate it as a hierarchical conditional generative model, where memory provides a rich data-dependent prior distribution. Consequently, the top-down memory and bottom-up perception are combined to produce the code representing an observation. Empirically, we demonstrate that the adaptive memory significantly improves generative models trained on both the Omniglot and CIFAR datasets. 
Compared with the Differentiable Neural Computer (DNC) and its variants, our memory model has greater capacity and is significantly easier to train.", "keywords": "memory;generative model;inference;neural network;hierarchical model", "primary_area": "", "supplementary_material": "", "author": "Yan Wu;Greg Wayne;Alex Graves;Timothy Lillicrap", "authorids": "yanwu@google.com;gregwayne@google.com;gravesa@google.com;countzero@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwu2018the,\ntitle={The Kanerva Machine: A Generative Distributed Memory},\nauthor={Yan Wu and Greg Wayne and Alex Graves and Timothy Lillicrap},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1HlA-ZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;2", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9888262262485457347&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1HlA-ZAZ", "pdf": "https://openreview.net/pdf?id=S1HlA-ZAZ", "email": ";;;", "author_num": 4 }, { "title": "Interpretable Counting for Visual Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/195", "id": "S1J2ZyZ0Z", "author_site": "Alexander Trott, Caiming Xiong, richard socher", "tldr": "We perform counting for visual question answering; our model produces interpretable outputs by counting directly from detected objects.", "abstract": "Questions that require counting a variety of objects in images remain a major challenge in visual question answering (VQA). The most common approaches to VQA involve either classifying answers based on fixed length representations of both the image and question or summing fractional counts estimated from each section of the image. In contrast, we treat counting as a sequential decision process and force our model to make discrete choices of what to count. Specifically, the model sequentially selects from detected objects and learns interactions between objects that influence subsequent selections. A distinction of our approach is its intuitive and interpretable output, as discrete counts are automatically grounded in the image. 
Furthermore, our method outperforms the state of the art architecture for VQA on multiple metrics that evaluate counting.", "keywords": "Counting;VQA;Object detection", "primary_area": "", "supplementary_material": "", "author": "Alexander Trott;Caiming Xiong;Richard Socher", "authorids": "atrott@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntrott2018interpretable,\ntitle={Interpretable Counting for Visual Question Answering},\nauthor={Alexander Trott and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1J2ZyZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14039637797688698399&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1J2ZyZ0Z", "pdf": "https://openreview.net/pdf?id=S1J2ZyZ0Z", "email": ";;", "author_num": 3 }, { "title": "Boosting Dilated Convolutional Networks with Mixed Tensor Decompositions", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/320", "id": "S1JHhv6TW", "author_site": "Nadav Cohen, Ronen Tamari, Amnon Shashua", "tldr": "We introduce the notion of mixed tensor decompositions, and use it to prove that interconnecting dilated convolutional networks boosts their expressive power.", "abstract": "The driving force behind deep networks is their ability to compactly represent rich classes of functions. The primary notion for formally reasoning about this phenomenon is expressive efficiency, which refers to a situation where one network must grow unfeasibly large in order to replicate functions of another. To date, expressive efficiency analyses focused on the architectural feature of depth, showing that deep networks are representationally superior to shallow ones. In this paper we study the expressive efficiency brought forth by connectivity, motivated by the observation that modern networks interconnect their layers in elaborate ways. We focus on dilated convolutional networks, a family of deep models delivering state of the art performance in sequence processing tasks. By introducing and analyzing the concept of mixed tensor decompositions, we prove that interconnecting dilated convolutional networks can lead to expressive efficiency. In particular, we show that even a single connection between intermediate layers can already lead to an almost quadratic gap, which in large-scale settings typically makes the difference between a model that is practical and one that is not. Empirical evaluation demonstrates how the expressive efficiency of connectivity, similarly to that of depth, translates into gains in accuracy. 
This leads us to believe that expressive efficiency may play a key role in developing new tools for deep network design.", "keywords": "Deep Learning;Expressive Efficiency;Dilated Convolutions;Tensor Decompositions", "primary_area": "", "supplementary_material": "", "author": "Nadav Cohen;Ronen Tamari;Amnon Shashua", "authorids": "cohennadav@ias.edu;ronent@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncohen2018boosting,\ntitle={Boosting Dilated Convolutional Networks with Mixed Tensor Decompositions},\nauthor={Nadav Cohen and Ronen Tamari and Amnon Shashua},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1JHhv6TW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;3;4", "rating_avg": 8.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5878589884999737901&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1JHhv6TW", "pdf": "https://openreview.net/pdf?id=S1JHhv6TW", "email": ";;", "author_num": 3 }, { "id": "S1LXVnxRb", "title": "Cross-Corpus Training with TreeLSTM for the Extraction of Biomedical Relationships from Text", "track": "main", "status": "Workshop", "tldr": "", "abstract": "A bottleneck problem in machine learning-based relationship extraction (RE) algorithms, and particularly of deep learning-based ones, is the availability of training data in the form of annotated corpora. For specific domains, such as biomedicine, the long time and high expertise required for the development of manually annotated corpora explain that most of the existing ones are relatively small (i.e., hundreds of sentences). Besides, larger corpora focusing on general or domain-specific relationships (such as citizenship or drug-drug interactions) have been developed. In this paper, we study how large annotated corpora developed for alternative tasks may improve the performances on biomedicine-related tasks, for which few annotated resources are available. We experiment with two deep learning-based models to extract relationships from biomedical texts with high performance. The first one combines locally extracted features using a Convolutional Neural Network (CNN) model, while the second exploits the syntactic structure of sentences using a Recursive Neural Network (RNN) architecture. Our experiments show that, contrary to the former, the latter benefits from a cross-corpus learning strategy to improve the performance of relationship extraction tasks. Indeed, our approach leads to the best published performances for two biomedical RE tasks, and to state-of-the-art results for two other biomedical RE tasks, for which few annotated resources are available (less than 400 manually annotated sentences). This may be particularly impactful in specialized domains in which training resources are scarce, because they would benefit from the training data of other domains for which large annotated corpora do exist. 
", "keywords": "Relationships Extraction;Deep Learning;TreeLSTM;NLP", "primary_area": "", "supplementary_material": "", "author": "Legrand Jo\u00ebl;Yannick Toussaint;Chedy Ra\u00efssi;Adrien Coulet", "authorids": "joel.legrand@loria.fr;yannick.toussaint@loria.fr;chedy.raissi@inria.fr;adrien.coulet@loria.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njo\u00ebl2018crosscorpus,\ntitle={Cross-Corpus Training with Tree{LSTM} for the Extraction of Biomedical Relationships from Text},\nauthor={Legrand Jo\u00ebl and Yannick Toussaint and Chedy Ra\u00efssi and Adrien Coulet},\nyear={2018},\nurl={https://openreview.net/forum?id=S1LXVnxRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1LXVnxRb", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9302852459521362742&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1NHaMW0b", "title": "ShakeDrop regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a powerful regularization method named \\textit{ShakeDrop regularization}.\nShakeDrop is inspired by Shake-Shake regularization that decreases error rates by disturbing learning.\nWhile Shake-Shake can be applied to only ResNeXt which has multiple branches, ShakeDrop can be applied to not only ResNeXt but also ResNet, Wide ResNet and PyramidNet in a memory efficient way.\nImportant and interesting feature of ShakeDrop is that it strongly disturbs learning by multiplying even a negative factor to the output of a convolutional layer in the forward training pass.\nThe effectiveness of ShakeDrop is confirmed by experiments on CIFAR-10/100 and Tiny ImageNet datasets.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yoshihiro Yamada;Masakazu Iwamura;Koichi Kise", "authorids": "yamada@m.cs.osakafu-u.ac.jp;masa@cs.osakafu-u.ac.jp;kise@cs.osakafu-u.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyamada2018shakedrop,\ntitle={ShakeDrop regularization},\nauthor={Yoshihiro Yamada and Masakazu Iwamura and Koichi Kise},\nyear={2018},\nurl={https://openreview.net/forum?id=S1NHaMW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1NHaMW0b", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13958846050552173562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "S1Ow_e-Rb", "title": "How do deep convolutional neural networks learn from raw audio waveforms?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Prior work on speech and audio processing has demonstrated the ability to obtain excellent performance when learning directly from raw audio waveforms using convolutional 
neural networks (CNNs). However, the exact inner workings of a CNN remain unclear, which hinders further developments and improvements in this direction. In this paper, we theoretically analyze and explain how deep CNNs learn from raw audio waveforms and identify potential limitations of existing network structures. Based on this analysis, we further propose a new network architecture (called SimpleNet), which offers a very simple but concise structure and high model interpretability. ", "keywords": "Convolutional neural networks;Audio processing;Speech processing", "primary_area": "", "supplementary_material": "", "author": "Yuan Gong;Christian Poellabauer", "authorids": "ygong1@nd.edu;cpoellab@nd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngong2018how,\ntitle={How do deep convolutional neural networks learn from raw audio waveforms?},\nauthor={Yuan Gong and Christian Poellabauer},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Ow_e-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1Ow_e-Rb", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9486751759109616478&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1PWi_lC-", "title": "Multi-task Learning on MNIST Image Datasets", "track": "main", "status": "Reject", "tldr": "multi-task learning works ", "abstract": "We apply multi-task learning to image classification tasks on MNIST-like datasets. The MNIST dataset has been referred to as the {\\em drosophila} of machine learning and has been the testbed of many learning theories. The NotMNIST dataset and the FashionMNIST dataset have been created with the MNIST dataset as reference. In this work, we exploit these MNIST-like datasets for multi-task learning. The datasets are pooled together for learning the parameters of joint classification networks. Then the learned parameters are used as the initial parameters to retrain disjoint classification networks. The baseline recognition models are all-convolution neural networks. Without multi-task learning, the recognition accuracies for MNIST, NotMNIST and FashionMNIST are 99.56\\%, 97.22\\% and 94.32\\% respectively. With multi-task learning to pre-train the networks, the recognition accuracies are respectively 99.70\\%, 97.46\\% and 95.25\\%. 
The results reaffirm that the multi-task learning framework, even with data of different genres, does lead to significant improvement.\n", "keywords": "multi-task learning;MNIST;image recognition", "primary_area": "", "supplementary_material": "", "author": "Po-Chen Hsieh;Chia-Ping Chen", "authorids": "st70712@gmail.com;cpchen@cse.nsysu.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhsieh2018multitask,\ntitle={Multi-task Learning on {MNIST} Image Datasets},\nauthor={Po-Chen Hsieh and Chia-Ping Chen},\nyear={2018},\nurl={https://openreview.net/forum?id=S1PWi_lC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1PWi_lC-", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12246419369977353095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1Q79heRW", "title": "Unsupervised Learning of Entailment-Vector Word Embeddings", "track": "main", "status": "Reject", "tldr": "We train word embeddings based on entailment instead of similarity, successfully predicting lexical entailment.", "abstract": "Entailment vectors are a principled way to encode in a vector what information is known and what is unknown. They are designed to model relations where one vector should include all the information in another vector, called entailment. This paper investigates the unsupervised learning of entailment vectors for the semantics of words. 
Using simple entailment-based models of the semantics of words in text (distributional semantics), we induce entailment-vector word embeddings which outperform the best previous results for predicting entailment between words, in unsupervised and semi-supervised experiments on hyponymy.\n", "keywords": "word embeddings;natural language semantics;entailment;unsupervised learning;distributional semantics", "primary_area": "", "supplementary_material": "", "author": "James Henderson", "authorids": "james.henderson@idiap.ch", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nhenderson2018unsupervised,\ntitle={Unsupervised Learning of Entailment-Vector Word Embeddings},\nauthor={James Henderson},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Q79heRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1Q79heRW", "pdf_size": 0, "rating": "3;3;7", "confidence": "5;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4sqE0jxSmTIJ:scholar.google.com/&scioq=Unsupervised+Learning+of+Entailment-Vector+Word+Embeddings&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "S1TgE7WR-", "title": "Covariant Compositional Networks For Learning Graphs", "track": "main", "status": "Workshop", "tldr": "A general framework for creating covariant graph neural networks", "abstract": "Most existing neural networks for learning graphs deal with the issue of permutation invariance by conceiving of the network as a message passing scheme, where each node sums the feature vectors coming from its neighbors. We argue that this imposes a limitation on their representation power, and instead propose a new general architecture for representing objects consisting of a hierarchy of parts, which we call Covariant Compositional Networks (CCNs). Here covariance means that the activation of each neuron must transform in a specific way under permutations, similarly to steerability in CNNs. We achieve covariance by making each activation transform according to a tensor representation of the permutation group, and derive the corresponding tensor aggregation rules that each neuron must implement. Experiments show that CCNs can outperform competing methods on some standard graph learning benchmarks. ", "keywords": "graph neural networks;message passing;label propagation;high order representation", "primary_area": "", "supplementary_material": "", "author": "Risi Kondor;Truong Son Hy;Horace Pan;Brandon M. Anderson;Shubhendu Trivedi", "authorids": "risi@cs.uchicago.edu;hytruongson@uchicago.edu;hopan@cs.uchicago.edu;brandona@uchicago.edu;shubhendu@ttic.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkondor2018covariant,\ntitle={Covariant Compositional Networks For Learning Graphs},\nauthor={Risi Kondor and Truong Son Hy and Horace Pan and Brandon M. 
Anderson and Shubhendu Trivedi},\nyear={2018},\nurl={https://openreview.net/forum?id=S1TgE7WR-},\n}", "github": "[![github](/images/github_icon.svg) HyTruongSon/GraphFlow](https://github.com/HyTruongSon/GraphFlow) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1TgE7WR-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1TgE7WR-", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4275267252416864115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Expressive power of recurrent neural networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/112", "id": "S1WRibb0Z", "author_site": "Valentin Khrulkov, Alexander Novikov, Ivan Oseledets", "tldr": "We prove the exponential efficiency of recurrent-type neural networks over shallow networks.", "abstract": "Deep neural networks are surprisingly efficient at solving practical tasks,\nbut the theory behind this phenomenon is only starting to catch up with\nthe practice. Numerous works show that depth is the key to this efficiency.\nA certain class of deep convolutional networks \u2013 namely those that correspond\nto the Hierarchical Tucker (HT) tensor decomposition \u2013 has been\nproven to have exponentially higher expressive power than shallow networks.\nI.e. a shallow network of exponential width is required to realize\nthe same score function as computed by the deep architecture. In this paper,\nwe prove the expressive power theorem (an exponential lower bound on\nthe width of the equivalent shallow network) for a class of recurrent neural\nnetworks \u2013 ones that correspond to the Tensor Train (TT) decomposition.\nThis means that even processing an image patch by patch with an RNN\ncan be exponentially more efficient than a (shallow) convolutional network\nwith one hidden layer. Using theoretical results on the relation between\nthe tensor decompositions we compare expressive powers of the HT- and\nTT-Networks. 
We also implement the recurrent TT-Networks and provide\nnumerical evidence of their expressivity.", "keywords": "Recurrent Neural Networks;Tensor Train;tensor decompositions;expressive power", "primary_area": "", "supplementary_material": "", "author": "Valentin Khrulkov;Alexander Novikov;Ivan Oseledets", "authorids": "khrulkov.v@gmail.com;sasha.v.novikov@gmail.com;i.oseledets@skoltech.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkhrulkov2018expressive,\ntitle={Expressive power of recurrent neural networks},\nauthor={Valentin Khrulkov and Alexander Novikov and Ivan Oseledets},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1WRibb0Z},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=S1WRibb0Z)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4175617605601726551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1WRibb0Z", "pdf": "https://openreview.net/pdf?id=S1WRibb0Z", "email": ";;", "author_num": 3 }, { "id": "S1XXq6lRW", "title": "Zero-shot Cross Language Text Classification", "track": "main", "status": "Reject", "tldr": "Cross Language Text Classification by universal encoding", "abstract": "Labeled text classification datasets are typically only available in a few select languages. In order to train a model for e.g news categorization in a language $L_t$ without a suitable text classification dataset there are two options. The first option is to create a new labeled dataset by hand, and the second option is to transfer label information from an existing labeled dataset in a source language $L_s$ to the target language $L_t$. In this paper we propose a method for sharing label information across languages by means of a language independent text encoder. The encoder will give almost identical representations to multilingual versions of the same text. This means that labeled data in one language can be used to train a classifier that works for the rest of the languages. The encoder is trained independently of any concrete classification task and can therefore subsequently be used for any classification task. We show that it is possible to obtain good performance even in the case where only a comparable corpus of texts is available. 
", "keywords": "Cross Language Text Classification;Neural Networks;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Dan Svenstrup;Jonas Meinertz Hansen;Ole Winther", "authorids": "dsve@dtu.dk;jonas@meinertz.org;olwi@dtu.dk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsvenstrup2018zeroshot,\ntitle={Zero-shot Cross Language Text Classification},\nauthor={Dan Svenstrup and Jonas Meinertz Hansen and Ole Winther},\nyear={2018},\nurl={https://openreview.net/forum?id=S1XXq6lRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1XXq6lRW", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7767076476417896155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Model compression via distillation and quantization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/8", "id": "S1XolQbRW", "author_site": "Antonio Polino, Razvan Pascanu, Dan Alistarh", "tldr": "Obtains state-of-the-art accuracy for quantized, shallow nets by leveraging distillation. ", "abstract": "Deep neural networks (DNNs) continue to make significant advances, solving tasks from image classification to translation or reinforcement learning. One aspect of the field receiving considerable attention is efficiently executing deep models in resource-constrained environments, such as mobile or embedded devices. This paper focuses on this problem, and proposes two new compression methods, which jointly leverage weight quantization and distillation of larger teacher networks into smaller student networks. The first method we propose is called quantized distillation and leverages distillation during the training process, by incorporating distillation loss, expressed with respect to the teacher, into the training of a student network whose weights are quantized to a limited set of levels. The second method, differentiable quantization, optimizes the location of quantization points through stochastic gradient descent, to better fit the behavior of the teacher model. We validate both methods through experiments on convolutional and recurrent architectures. We show that quantized shallow students can reach similar accuracy levels to full-precision teacher models, while providing order of magnitude compression, and inference speedup that is linear in the depth reduction. 
In sum, our results enable DNNs for resource-constrained environments to leverage architecture and accuracy advances developed on more powerful devices.\n", "keywords": "quantization;distillation;model compression", "primary_area": "", "supplementary_material": "", "author": "Antonio Polino;Razvan Pascanu;Dan Alistarh", "authorids": "antonio.polino1@gmail.com;razp@google.com;d.alistarh@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npolino2018model,\ntitle={Model compression via distillation and quantization},\nauthor={Antonio Polino and Razvan Pascanu and Dan Alistarh},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1XolQbRW},\n}", "github": "[![github](/images/github_icon.svg) antspy/quantized_distillation](https://github.com/antspy/quantized_distillation) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=S1XolQbRW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;2;5", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.3273268353539886, "gs_citation": 960, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9862176539747361028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1XolQbRW", "pdf": "https://openreview.net/pdf?id=S1XolQbRW", "email": ";;", "author_num": 3 }, { "id": "S1Y7OOlRZ", "title": "Massively Parallel Hyperparameter Tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern machine learning models are characterized by large hyperparameter search spaces and prohibitively expensive training costs. For such models, we cannot afford to train candidate models sequentially and wait months before finding a suitable hyperparameter configuration. Hence, we introduce the large-scale regime for parallel hyperparameter tuning, where we need to evaluate orders of magnitude more configurations than available parallel workers in a small multiple of the wall-clock time needed to train a single model. We propose a novel hyperparameter tuning algorithm for this setting that exploits both parallelism and aggressive early-stopping techniques, building on the insights of the Hyperband algorithm. Finally, we conduct a thorough empirical study of our algorithm on several benchmarks, including large-scale experiments with up to 500 workers. 
Our results show that our proposed algorithm finds good hyperparameter settings nearly an order of magnitude faster than random search.", "keywords": "parallel hyperparameter tuning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Lisha Li;Kevin Jamieson;Afshin Rostamizadeh;Katya Gonina;Moritz Hardt;Benjamin Recht;Ameet Talwalkar", "authorids": "lishal@cs.ucla.edu;jamieson@cs.washington.edu;rostami@google.com;kgonina@google.com;hardt@berkeley.edu;brecht@berkeley.edu;talwalkar@cmu.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nli2018massively,\ntitle={Massively Parallel Hyperparameter Tuning},\nauthor={Lisha Li and Kevin Jamieson and Afshin Rostamizadeh and Katya Gonina and Moritz Hardt and Benjamin Recht and Ameet Talwalkar},\nyear={2018},\nurl={https://openreview.net/forum?id=S1Y7OOlRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1Y7OOlRZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 7, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17243713618182045142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "WHAI: Weibull Hybrid Autoencoding Inference for Deep Topic Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/42", "id": "S1cZsf-RW", "author_site": "Hao Zhang, Bo Chen, Dandan Guo, Mingyuan Zhou", "tldr": "", "abstract": "To train an inference network jointly with a deep generative topic model, making it both scalable to big corpora and fast in out-of-sample prediction, we develop Weibull hybrid autoencoding inference (WHAI) for deep latent Dirichlet allocation, which infers posterior samples via a hybrid of stochastic-gradient MCMC and autoencoding variational Bayes. The generative network of WHAI has a hierarchy of gamma distributions, while the inference network of WHAI is a Weibull upward-downward variational autoencoder, which integrates a deterministic-upward deep neural network, and a stochastic-downward deep generative model based on a hierarchy of Weibull distributions. The Weibull distribution can be used to well approximate a gamma distribution with an analytic Kullback-Leibler divergence, and has a simple reparameterization via the uniform noise, which help efficiently compute the gradients of the evidence lower bound with respect to the parameters of the inference network. 
The effectiveness and efficiency of WHAI are illustrated with experiments on big corpora.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hao Zhang;Bo Chen;Dandan Guo;Mingyuan Zhou", "authorids": "zhanghao_xidian@163.com;bchen@mail.xidian.edu.cn;gdd_xidian@126.com;mzhou@utexas.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018whai,\ntitle={{WHAI}: Weibull Hybrid Autoencoding Inference for Deep Topic Modeling},\nauthor={Hao Zhang and Bo Chen and Dandan Guo and Mingyuan Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1cZsf-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;2", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8040427077585946351&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=S1cZsf-RW", "pdf": "https://openreview.net/pdf?id=S1cZsf-RW", "email": ";;;", "author_num": 4 }, { "id": "S1fHmlbCW", "title": "Neural Networks for irregularly observed continuous-time Stochastic Processes", "track": "main", "status": "Reject", "tldr": "Neural architectures providing representations of irregularly observed signals that provably enable signal reconstruction.", "abstract": "Designing neural networks for continuous-time stochastic processes is challenging, especially when observations are made irregularly. In this article, we analyze neural networks from a frame theoretic perspective to identify the sufficient conditions that enable smoothly recoverable representations of signals in L^2(R). Moreover, we show that, under certain assumptions, these properties hold even when signals are irregularly observed. As we converge to the family of (convolutional) neural networks that satisfy these conditions, we show that we can optimize our convolution filters while constraining them so that they effectively compute a Discrete Wavelet Transform. Such a neural network can efficiently divide the time-axis of a signal into orthogonal sub-spaces of different temporal scale and localization. We evaluate the resulting neural network on an assortment of synthetic and real-world tasks: parsimonious auto-encoding, video classification, and financial forecasting.", "keywords": "Deep Learning;Stochastic Processes;Time Series Analysis", "primary_area": "", "supplementary_material": "", "author": "Francois W. Belletti;Alexander Ku;Joseph E. Gonzalez", "authorids": "francois.belletti@berkeley.edu;alexku@berkeley.edu;jegonzal@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nw.2018neural,\ntitle={Neural Networks for irregularly observed continuous-time Stochastic Processes},\nauthor={Francois W. Belletti and Alexander Ku and Joseph E. 
Gonzalez},\nyear={2018},\nurl={https://openreview.net/forum?id=S1fHmlbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1fHmlbCW", "pdf_size": 0, "rating": "2;5;5", "confidence": "3;5;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14659906822919634783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1fcY-Z0-", "title": "Bayesian Hypernetworks", "track": "main", "status": "Reject", "tldr": "We propose Bayesian hypernetworks: a framework for approximate Bayesian inference in neural networks.", "abstract": "We propose Bayesian hypernetworks: a framework for approximate Bayesian inference in neural networks. A Bayesian hypernetwork, h, is a neural network which learns to transform a simple noise distribution, p(e) = N(0,I), to a distribution q(t) := q(h(e)) over the parameters t of another neural network (the ``primary network). We train q with variational inference, using an invertible h to enable efficient estimation of the variational lower bound on the posterior p(t | D) via sampling. In contrast to most methods for Bayesian deep learning, Bayesian hypernets can represent a complex multimodal approximate posterior with correlations between parameters, while enabling cheap iid sampling of q(t). In practice, Bayesian hypernets provide a better defense against adversarial examples than dropout, and also exhibit competitive performance on a suite of tasks which evaluate model uncertainty, including regularization, active learning, and anomaly detection.\n", "keywords": "variational inference;bayesian inference;deep networks", "primary_area": "", "supplementary_material": "", "author": "David Krueger;Chin-Wei Huang;Riashat Islam;Ryan Turner;Alexandre Lacoste;Aaron Courville", "authorids": "david.scott.krueger@gmail.com;chin-wei.huang@umontreal.ca;riashat.islam@mail.mcgill.ca;turnerry@iro.umontreal.ca;allac@elementai.com;aaron.courville@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkrueger2018bayesian,\ntitle={Bayesian Hypernetworks},\nauthor={David Krueger and Chin-Wei Huang and Riashat Islam and Ryan Turner and Alexandre Lacoste and Aaron Courville},\nyear={2018},\nurl={https://openreview.net/forum?id=S1fcY-Z0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1fcY-Z0-", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4384389698572218001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S1fduCl0b", "title": "Lifelong Generative Modeling", "track": "main", "status": "Reject", "tldr": "Lifelong distributional learning through a student-teacher architecture coupled with a cross model posterior regularizer.", "abstract": "Lifelong learning is the problem of learning multiple consecutive tasks in a sequential manner where knowledge gained from previous tasks is retained and used for future learning. 
It is essential towards the development of intelligent machines that can adapt to their surroundings. In this work we focus on a lifelong learning approach to generative modeling where we continuously incorporate newly observed streaming distributions into our learnt model. We do so through a student-teacher architecture which allows us to learn and preserve all the distributions seen so far without the need to retain the past data nor the past models. Through the introduction of a novel cross-model regularizer, the student model leverages the information learnt by the teacher, which acts as a summary of everything seen till now. The regularizer has the additional benefit of reducing the effect of catastrophic interference that appears when we learn over streaming data. We demonstrate its efficacy on streaming distributions as well as its ability to learn a common latent representation across a complex transfer learning scenario.\n", "keywords": "Lifelong;Generative Modeling;Variational Autoencoder;VAE;Catastrophic Interference", "primary_area": "", "supplementary_material": "", "author": "Jason Ramapuram;Magda Gregorova;Alexandros Kalousis", "authorids": "jason.ramapuram@etu.unige.ch;magda.gregorova@unige.ch;alexandros.kalousis@hesge.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nramapuram2018lifelong,\ntitle={Lifelong Generative Modeling},\nauthor={Jason Ramapuram and Magda Gregorova and Alexandros Kalousis},\nyear={2018},\nurl={https://openreview.net/forum?id=S1fduCl0b},\n}", "github": "[![github](/images/github_icon.svg) jramapuram/LifelongVAE_pytorch](https://github.com/jramapuram/LifelongVAE_pytorch)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1fduCl0b", "pdf_size": 0, "rating": "4;4;9", "confidence": "2;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.5000000000000001, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2447311119178768425&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Depthwise Separable Convolutions for Neural Machine Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/247", "id": "S1jBcueAb", "author_site": "Lukasz Kaiser, Aidan Gomez, Francois Chollet", "tldr": "Depthwise separable convolutions improve neural machine translation: the more separable the better.", "abstract": "Depthwise separable convolutions reduce the number of parameters and computation used in convolutional operations while increasing representational efficiency.\nThey have been shown to be successful in image classification models, both in obtaining better models than previously possible for a given parameter count (the Xception architecture) and considerably reducing the number of parameters required to perform at a given level (the MobileNets family of architectures). Recently, convolutional sequence-to-sequence networks have been applied to machine translation tasks with good results. In this work, we study how depthwise separable convolutions can be applied to neural machine translation. 
We introduce a new architecture inspired by Xception and ByteNet, called SliceNet, which enables a significant reduction of the parameter count and amount of computation needed to obtain results like ByteNet, and, with a similar parameter count, achieves better results.\nIn addition to showing that depthwise separable convolutions perform well for machine translation, we investigate the architectural changes that they enable: we observe that thanks to depthwise separability, we can increase the length of convolution windows, removing the need for filter dilation. We also introduce a new super-separable convolution operation that further reduces the number of parameters and computational cost of the models.", "keywords": "convolutions;neural machine translation", "primary_area": "", "supplementary_material": "", "author": "Lukasz Kaiser;Aidan N. Gomez;Francois Chollet", "authorids": "lukaszkaiser@google.com;aidan.n.gomez@gmail.com;fchollet@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkaiser2018depthwise,\ntitle={Depthwise Separable Convolutions for Neural Machine Translation},\nauthor={Lukasz Kaiser and Aidan N. Gomez and Francois Chollet},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1jBcueAb},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1jBcueAb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 416, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7520360878420709403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1jBcueAb", "pdf": "https://openreview.net/pdf?id=S1jBcueAb", "email": ";;", "author_num": 3 }, { "id": "S1lN69AT-", "title": "To Prune, or Not to Prune: Exploring the Efficacy of Pruning for Model Compression", "track": "main", "status": "Workshop", "tldr": "We demonstrate that large, but pruned models (large-sparse) outperform their smaller, but dense (small-dense) counterparts with identical memory footprint.", "abstract": "Model pruning seeks to induce sparsity in a deep neural network's various connection matrices, thereby reducing the number of nonzero-valued parameters in the model. Recent reports (Han et al., 2015; Narang et al., 2017) prune deep networks at the cost of only a marginal loss in accuracy and achieve a sizable reduction in model size. This hints at the possibility that the baseline models in these experiments are perhaps severely over-parameterized at the outset and a viable alternative for model compression might be to simply reduce the number of hidden units while maintaining the model's dense connection structure, exposing a similar trade-off in model size and accuracy. 
We investigate these two distinct paths for model compression within the context of energy-efficient inference in resource-constrained environments and propose a new gradual pruning technique that is simple and straightforward to apply across a variety of models/datasets with minimal tuning and can be seamlessly incorporated within the training process. We compare the accuracy of large, but pruned models (large-sparse) and their smaller, but dense (small-dense) counterparts with identical memory footprint. Across a broad range of neural network architectures (deep CNNs, stacked LSTM, and seq2seq LSTM models), we find large-sparse models to consistently outperform small-dense models and achieve up to 10x reduction in number of non-zero parameters with minimal loss in accuracy.", "keywords": "pruning;model sparsity;model compression;deep learning", "primary_area": "", "supplementary_material": "", "author": "Michael H. Zhu;Suyog Gupta", "authorids": "mhzhu@cs.stanford.edu;suyoggupta@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018to,\ntitle={To Prune, or Not to Prune: Exploring the Efficacy of Pruning for Model Compression},\nauthor={Michael H. Zhu and Suyog Gupta},\nyear={2018},\nurl={https://openreview.net/forum?id=S1lN69AT-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=S1lN69AT-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1lN69AT-", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1609, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14305028926417652827&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "S1m6h21Cb", "title": "The Cramer Distance as a Solution to Biased Wasserstein Gradients", "track": "main", "status": "Reject", "tldr": "The Wasserstein distance is hard to minimize with stochastic gradient descent, while the Cramer distance can be optimized easily and works just as well.", "abstract": "The Wasserstein probability metric has received much attention from the machine learning community. Unlike the Kullback-Leibler divergence, which strictly measures change in probability, the Wasserstein metric reflects the underlying geometry between outcomes. The value of being sensitive to this geometry has been demonstrated, among others, in ordinal regression and generative modelling, and most recently in reinforcement learning. In this paper we describe three natural properties of probability divergences that we believe reflect requirements from machine learning: sum invariance, scale sensitivity, and unbiased sample gradients. The Wasserstein metric possesses the first two properties but, unlike the Kullback-Leibler divergence, does not possess the third. We provide empirical evidence suggesting this is a serious issue in practice. Leveraging insights from probabilistic forecasting we propose an alternative to the Wasserstein metric, the Cram\u00e9r distance. We show that the Cram\u00e9r distance possesses all three desired properties, combining the best of the Wasserstein and Kullback-Leibler divergences. We give empirical results on a number of domains comparing these three divergences. 
To illustrate the practical relevance of the Cram\u00e9r distance we design a new algorithm, the Cram\u00e9r Generative Adversarial Network (GAN), and show that it has a number of desirable properties over the related Wasserstein GAN.\n", "keywords": "Probability metrics;Wasserstein metric;stochastic gradient descent;GANs", "primary_area": "", "supplementary_material": "", "author": "Marc G. Bellemare;Ivo Danihelka;Will Dabney;Shakir Mohamed;Balaji Lakshminarayanan;Stephan Hoyer;Remi Munos", "authorids": ";danihelka@google.com;;;;;", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ng.2018the,\ntitle={The Cramer Distance as a Solution to Biased Wasserstein Gradients},\nauthor={Marc G. Bellemare and Ivo Danihelka and Will Dabney and Shakir Mohamed and Balaji Lakshminarayanan and Stephan Hoyer and Remi Munos},\nyear={2018},\nurl={https://openreview.net/forum?id=S1m6h21Cb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=S1m6h21Cb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1m6h21Cb", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;2", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 7, "corr_rating_confidence": -0.9285714285714286, "gs_citation": 467, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9730474053299423065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Semantically Decomposing the Latent Spaces of Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/281", "id": "S1nQvfgA-", "author_site": "Chris Donahue, Zachary Lipton, Akshay Balsubramani, Julian McAuley", "tldr": "SD-GANs disentangle latent codes according to known commonalities in a dataset (e.g. photographs depicting the same person).", "abstract": "We propose a new algorithm for training generative adversarial networks to jointly learn latent codes for both identities (e.g. individual humans) and observations (e.g. specific photographs). In practice, this means that by fixing the identity portion of latent codes, we can generate diverse images of the same subject, and by fixing the observation portion we can traverse the manifold of subjects while maintaining contingent aspects such as lighting and pose. Our algorithm features a pairwise training scheme in which each sample from the generator consists of two images with a common identity code. Corresponding samples from the real dataset consist of two distinct photographs of the same subject. In order to fool the discriminator, the generator must produce images that are both photorealistic, distinct, and appear to depict the same person. We augment both the DCGAN and BEGAN approaches with Siamese discriminators to accommodate pairwise training. Experiments with human judges and an off-the-shelf face verification system demonstrate our algorithm\u2019s ability to generate convincing, identity-matched photographs.", "keywords": "disentangled representations;generative adversarial networks;generative modeling;image synthesis", "primary_area": "", "supplementary_material": "", "author": "Chris Donahue;Zachary C. 
Lipton;Akshay Balsubramani;Julian McAuley", "authorids": "cdonahue@ucsd.edu;zlipton@cmu.edu;abalsubr@stanford.edu;jmcauley@cs.ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndonahue2018semantically,\ntitle={Semantically Decomposing the Latent Spaces of Generative Adversarial Networks},\nauthor={Chris Donahue and Akshay Balsubramani and Julian McAuley and Zachary C. Lipton},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1nQvfgA-},\n}", "github": "[![github](/images/github_icon.svg) chrisdonahue/sdgan](https://github.com/chrisdonahue/sdgan)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8664262583947148240&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1nQvfgA-", "pdf": "https://openreview.net/pdf?id=S1nQvfgA-", "email": ";;;", "author_num": 4 }, { "id": "S1p31z-Ab", "title": "Deep contextualized word representations", "track": "main", "status": "Poster", "tldr": "We introduce a new type of deep contextualized word representation that significantly improves the state of the art for a range of challenging NLP tasks.", "abstract": "We introduce a new type of deep contextualized word representation that models both (1) complex characteristics of word use (e.g., syntax and semantics), and (2) how these uses vary across linguistic contexts (i.e., to model polysemy). Our word vectors are learned functions of the internal states of a deep bidirectional language model (biLM), which is pretrained on a large text corpus. We show that these representations can be easily added to existing models and significantly improve the state of the art across six challenging NLP problems, including question answering, textual entailment and sentiment analysis. 
We also present an analysis showing that exposing the deep internals of the pretrained network is crucial, allowing downstream models to mix different types of semi-supervision signals.\n", "keywords": "representation learning;contextualized word embeddings", "primary_area": "", "supplementary_material": "", "author": "Matthew E Peters;Mark Neumann;Mohit Iyyer;Matt Gardner;Christopher Clark;Kenton Lee;Luke Zettlemoyer", "authorids": "matthewp@allenai.org;markn@allenai.org;mohiti@allenai.org;mattg@allenai.org;csquared@cs.washington.edu;kentonl@cs.washington.edu;lsz@cs.washington.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@article{\ne2018deep,\ntitle={Deep contextualized word representations},\nauthor={Matthew E Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, Luke Zettlemoyer},\njournal={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1p31z-Ab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1p31z-Ab", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "S1pWFzbAW", "title": "Weightless: Lossy Weight Encoding For Deep Neural Network Compression", "track": "main", "status": "Workshop", "tldr": "We propose a new way to compress neural networks using probabilistic data structures.", "abstract": "The large memory requirements of deep neural networks strain the capabilities of many devices, limiting their deployment and adoption. Model compression methods effectively reduce the memory requirements of these models, usually through applying transformations such as weight pruning or quantization. In this paper, we present a novel scheme for lossy weight encoding which complements conventional compression techniques. The encoding is based on the Bloomier filter, a probabilistic data structure that can save space at the cost of introducing random errors. 
Leveraging the ability of neural networks to tolerate these imperfections and by re-training around the errors, the proposed technique, Weightless, can compress DNN weights by up to 496x; with the same model accuracy, this results in up to a 1.51x improvement over the state-of-the-art.", "keywords": "Deep Neural Network;Compression;Sparsity", "primary_area": "", "supplementary_material": "", "author": "Brandon Reagen;Udit Gupta;Robert Adolf;Michael Mitzenmacher;Alexander Rush;Gu-Yeon Wei;David Brooks", "authorids": "reagen@fas.harvard.edu;ugupta@g.harvard.edu;rdadolf@seas.harvard.edu;michaelm@eecs.harvard.edu;srush@seas.harvard.edu;gywei@g.harvard.edu;dbrooks@eecs.harvard.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nreagen2018weightless,\ntitle={Weightless: Lossy Weight Encoding For Deep Neural Network Compression},\nauthor={Brandon Reagen and Udit Gupta and Robert Adolf and Michael Mitzenmacher and Alexander Rush and Gu-Yeon Wei and David Brooks},\nyear={2018},\nurl={https://openreview.net/forum?id=S1pWFzbAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1pWFzbAW", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15524028552507582001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "S1q_Cz-Cb", "title": "Training Neural Machines with Partial Traces", "track": "main", "status": "Reject", "tldr": "We increase the amount of trace supervision possible to utilize when training fully differentiable neural machine architectures.", "abstract": "We present a novel approach for training neural abstract architectures which in- corporates (partial) supervision over the machine\u2019s interpretable components. To cleanly capture the set of neural architectures to which our method applies, we introduce the concept of a differential neural computational machine (\u2202NCM) and show that several existing architectures (e.g., NTMs, NRAMs) can be instantiated as a \u2202NCM and can thus benefit from any amount of additional supervision over their interpretable components. 
Based on our method, we performed a detailed experimental evaluation with both, the NTM and NRAM architectures, and showed that the approach leads to significantly better convergence and generalization capabilities of the learning phase than when training using only input-output examples.\n", "keywords": "Neural Abstract Machines;Neural Turing Machines;Neural Random Access Machines;Program Synthesis;Program Induction", "primary_area": "", "supplementary_material": "", "author": "Matthew Mirman;Dimitar Dimitrov;Pavle Djordjevich;Timon Gehr;Martin Vechev", "authorids": "matthew.mirman@inf.ethz.ch;dpavle@student.ethz.ch;dimitar.dimitrov@inf.ethz.ch;timon.gehr@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmirman2018training,\ntitle={Training Neural Machines with Partial Traces},\nauthor={Matthew Mirman and Dimitar Dimitrov and Pavle Djordjevich and Timon Gehr and Martin Vechev},\nyear={2018},\nurl={https://openreview.net/forum?id=S1q_Cz-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1q_Cz-Cb", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aon-GnA2UR8J:scholar.google.com/&scioq=Training+Neural+Machines+with+Partial+Traces&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1sRrN-CW", "title": "Revisiting Knowledge Base Embedding as Tensor Decomposition", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of knowledge base (KB) embedding, which is usually addressed through two frameworks---neural KB embedding and tensor decomposition. In this work, we theoretically analyze the neural embedding framework and subsequently connect it with tensor based embedding. Specifically, we show that in neural KB embedding the two commonly adopted optimization solutions---margin-based and negative sampling losses---are closely related to each other. We also reach the closed-form tensor that is implicitly approximated by popular neural KB approaches, revealing the underlying connection between neural and tensor based KB embedding models. Grounded in the theoretical results, we further present a tensor decomposition based framework KBTD to directly approximate the derived closed form tensor. Under this framework, the neural KB embedding models, such as NTN, TransE, Bilinear, and DISTMULT, are unified into a general tensor optimization architecture. Finally, we conduct experiments on the link prediction task in WordNet and Freebase, empirically demonstrating the effectiveness of the KBTD framework. 
\n", "keywords": "Knowledge base embedding", "primary_area": "", "supplementary_material": "", "author": "Jiezhong Qiu;Hao Ma;Yuxiao Dong;Kuansan Wang;Jie Tang", "authorids": "xptree@gmail.com;haoma@microsoft.com;yuxdong@microsoft.com;kuansanw@microsoft.com;jietang@tsinghua.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nqiu2018revisiting,\ntitle={Revisiting Knowledge Base Embedding as Tensor Decomposition},\nauthor={Jiezhong Qiu and Hao Ma and Yuxiao Dong and Kuansan Wang and Jie Tang},\nyear={2018},\nurl={https://openreview.net/forum?id=S1sRrN-CW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1sRrN-CW", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17610106688503928219&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "NerveNet: Learning Structured Policy with Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/66", "id": "S1sqHMZCb", "author_site": "Tingwu Wang, Renjie Liao, Jimmy Ba, Sanja Fidler", "tldr": "using graph neural network to model structural information of the agents to improve policy and transferability ", "abstract": "We address the problem of learning structured policies for continuous control. In traditional reinforcement learning, policies of agents are learned by MLPs which take the concatenation of all observations from the environment as input for predicting actions. In this work, we propose NerveNet to explicitly model the structure of an agent, which naturally takes the form of a graph. Specifically, serving as the agent's policy network, NerveNet first propagates information over the structure of the agent and then predict actions for different parts of the agent. In the experiments, we first show that our NerveNet is comparable to state-of-the-art methods on standard MuJoCo environments. We further propose our customized reinforcement learning environments for benchmarking two types of structure transfer learning tasks, i.e., size and disability transfer. 
We demonstrate that policies learned by NerveNet are significantly better than policies learned by other models and are able to transfer even in a zero-shot setting.\n", "keywords": "reinforcement learning;transfer learning;graph neural network", "primary_area": "", "supplementary_material": "", "author": "Tingwu Wang;Renjie Liao;Jimmy Ba;Sanja Fidler", "authorids": "tingwuwang@cs.toronto.edu;rjliao@cs.toronto.edu;jimmy@psi.toronto.edu;fidler@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2018nervenet,\ntitle={NerveNet: Learning Structured Policy with Graph Neural Networks},\nauthor={Tingwu Wang and Renjie Liao and Jimmy Ba and Sanja Fidler},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1sqHMZCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15124575448090085151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1sqHMZCb", "pdf": "https://openreview.net/pdf?id=S1sqHMZCb", "email": ";;;", "author_num": 4 }, { "id": "S1tWRJ-R-", "title": "Joint autoencoders: a flexible meta-learning framework", "track": "main", "status": "Reject", "tldr": "A generic framework for handling transfer and multi-task learning using pairs of autoencoders with task-specific and shared weights.", "abstract": "The incorporation of prior knowledge into learning is essential in achieving good performance based on small noisy samples. Such knowledge is often incorporated through the availability of related data arising from domains and tasks similar to the one of current interest. Ideally one would like to allow both the data for the current task and for previous related tasks to self-organize the learning system in such a way that commonalities and differences between the tasks are learned in a data-driven fashion. We develop a framework for learning multiple tasks simultaneously, based on sharing features that are common to all tasks, achieved through the use of a modular deep feedforward neural network consisting of shared branches, dealing with the common features of all tasks, and private branches, learning the specific unique aspects of each task. Once an appropriate weight sharing architecture has been established, learning takes place through standard algorithms for feedforward networks, e.g., stochastic gradient descent and its variations. The method deals with meta-learning (such as domain adaptation, transfer and multi-task learning) in a unified fashion, and can easily deal with data arising from different types of sources. 
Numerical experiments demonstrate the effectiveness of learning in domain adaptation and transfer learning setups, and provide evidence for the flexible and task-oriented representations arising in the network.", "keywords": "transfer learning;domain adaptation;unsupervised learning;autoencoders;multi-task learning", "primary_area": "", "supplementary_material": "", "author": "Baruch Epstein;Ron Meir;Tomer Michaeli", "authorids": "baruch.epstein@gmail.com;rmeir@ee.technion.ac.il;tomer.m@ee.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nepstein2018joint,\ntitle={Joint autoencoders: a flexible meta-learning framework},\nauthor={Baruch Epstein and Ron Meir and Tomer Michaeli},\nyear={2018},\nurl={https://openreview.net/forum?id=S1tWRJ-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1tWRJ-R-", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15183511324078278477&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Adaptive Dropout with Rademacher Complexity Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/287", "id": "S1uxsye0Z", "author_site": "Ke Zhai, Huan Wang", "tldr": "We propose a novel framework to adaptively adjust the dropout rates for the deep neural network based on a Rademacher complexity bound.", "abstract": "We propose a novel framework to adaptively adjust the dropout rates for the deep neural network based on a Rademacher complexity bound. The state-of-the-art deep learning algorithms impose dropout strategy to prevent feature co-adaptation. However, choosing the dropout rates remains an art of heuristics or relies on empirical grid-search over some hyperparameter space. In this work, we show the network Rademacher complexity is bounded by a function related to the dropout rate vectors and the weight coefficient matrices. Subsequently, we impose this bound as a regularizer and provide a theoretical justified way to trade-off between model complexity and representation power. Therefore, the dropout rates and the empirical loss are unified into the same objective function, which is then optimized using the block coordinate descent algorithm. 
We discover that the adaptively adjusted dropout rates converge to some interesting distributions that reveal meaningful patterns. Experiments on the task of image and document classification also show our method achieves better performance compared to the state-of-the-art dropout algorithms.", "keywords": "model complexity;regularization;deep learning;model generalization;adaptive dropout", "primary_area": "", "supplementary_material": "", "author": "Ke Zhai;Huan Wang", "authorids": "zhaikedavy@gmail.com;joyousprince@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhai2018adaptive,\ntitle={Adaptive Dropout with Rademacher Complexity Regularization},\nauthor={Ke Zhai and Huan Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1uxsye0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 22, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6393473408345525754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1uxsye0Z", "pdf": "https://openreview.net/pdf?id=S1uxsye0Z", "email": ";", "author_num": 2 }, { "title": "Unsupervised Representation Learning by Predicting Image Rotations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/227", "id": "S1v4N2l0-", "author_site": "Spyros Gidaris, Praveer Singh, Nikos Komodakis", "tldr": "", "abstract": "Over the last years, deep convolutional neural networks (ConvNets) have transformed the field of computer vision thanks to their unparalleled capacity to learn high level semantic image features. However, in order to successfully learn those features, they usually require massive amounts of manually labeled data, which is both expensive and impractical to scale. Therefore, unsupervised semantic feature learning, i.e., learning without requiring manual annotation effort, is of crucial importance in order to successfully harvest the vast amount of visual data that are available today. In our work we propose to learn image features by training ConvNets to recognize the 2d rotation that is applied to the image that it gets as input. We demonstrate both qualitatively and quantitatively that this apparently simple task actually provides a very powerful supervisory signal for semantic feature learning. We exhaustively evaluate our method in various unsupervised feature learning benchmarks and we exhibit in all of them state-of-the-art performance. Specifically, our results on those benchmarks demonstrate dramatic improvements w.r.t. prior state-of-the-art approaches in unsupervised representation learning and thus significantly close the gap with supervised feature learning. For instance, in the PASCAL VOC 2007 detection task our unsupervised pre-trained AlexNet model achieves the state-of-the-art (among unsupervised methods) mAP of 54.4% that is only 2.4 points lower than the supervised case. 
We get similarly striking results when we transfer our unsupervised learned features on various other tasks, such as ImageNet classification, PASCAL classification, PASCAL segmentation, and CIFAR-10 classification. The code and models of our paper will be published on:\nhttps://github.com/gidariss/FeatureLearningRotNet", "keywords": "Unsupervised representation learning", "primary_area": "", "supplementary_material": "", "author": "Spyros Gidaris;Praveer Singh;Nikos Komodakis", "authorids": "spyros.gidaris@enpc.fr;praveer.singh@enpc.fr;nikos.komodakis@enpc.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngidaris2018unsupervised,\ntitle={Unsupervised Representation Learning by Predicting Image Rotations},\nauthor={Spyros Gidaris and Praveer Singh and Nikos Komodakis},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1v4N2l0-},\n}", "github": "[![github](/images/github_icon.svg) gidariss/FeatureLearningRotNet](https://github.com/gidariss/FeatureLearningRotNet) + [![Papers with Code](/images/pwc_icon.svg) 19 community implementations](https://paperswithcode.com/paper/?openreview=S1v4N2l0-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 4239, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12748509220929577948&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "openreview": "https://openreview.net/forum?id=S1v4N2l0-", "pdf": "https://openreview.net/pdf?id=S1v4N2l0-", "email": ";;", "author_num": 3 }, { "id": "S1viikbCW", "title": "TCAV: Relative concept importance testing with Linear Concept Activation Vectors", "track": "main", "status": "Reject", "tldr": "This work aims to provide quantitative answers to the relative importance of concepts of interest via concept activation vectors (CAV). In particular, this framework enables non-machine learning experts to express concepts of interest and test hypotheses using examples (e.g., a set of pictures that illustrate the concept). We show that CAV can be learned given a relatively small set of examples. Hypothesis testing with CAV can answer whether a particular concept (e.g., gender) is more important in predicting a given class (e.g., doctor) than other sets of concepts. Interpreting networks with CAV does not require any retraining or modification of the network. ", "abstract": "Despite neural network\u2019s high performance, the lack of interpretability has been the main bottleneck for its safe usage in practice. In domains with high stakes (e.g., medical diagnosis), gaining insights into the network is critical for gaining trust and being adopted. One of the ways to improve interpretability of a NN is to explain the importance of a particular concept (e.g., gender) in prediction. This is useful for explaining reasoning behind the networks\u2019 predictions, and for revealing any biases the network may have. This work aims to provide quantitative answers to \\textit{the relative importance of concepts of interest} via concept activation vectors (CAV). 
In particular, this framework enables non-machine learning experts to express concepts of interests and test hypotheses using examples (e.g., a set of pictures that illustrate the concept). We show that CAV can be learned given a relatively small set of examples. Testing with CAV, for example, can answer whether a particular concept (e.g., gender) is more important in predicting a given class (e.g., doctor) than other set of concepts. Interpreting with CAV does not require any retraining or modification of the network. We show that many levels of meaningful concepts are learned (e.g., color, texture, objects, a person\u2019s occupation), and we present CAV\u2019s \\textit{empirical deepdream} \u2014 where we maximize an activation using a set of example pictures. We show how various insights can be gained from the relative importance testing with CAV.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Been Kim;Justin Gilmer;Martin Wattenberg;Fernanda Vi\u00e9gas", "authorids": "beenkim@google.com;viegas@google.com;wattenberg@google.com;gilmer@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkim2018tcav,\ntitle={{TCAV}: Relative concept importance testing with Linear Concept Activation Vectors},\nauthor={Been Kim and Justin Gilmer and Martin Wattenberg and Fernanda Vi\u00e9gas},\nyear={2018},\nurl={https://openreview.net/forum?id=S1viikbCW},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/tcav](https://github.com/tensorflow/tcav) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1viikbCW)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1;AnonReviewer5", "site": "https://openreview.net/forum?id=S1viikbCW", "pdf_size": 0, "rating": "3;4;4;5", "confidence": "5;3;4;2", "rating_avg": 4.0, "confidence_avg": 3.5, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.9486832980505138, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6807090879660572228&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Leave no Trace: Learning to Reset for Safe and Autonomous Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/118", "id": "S1vuO-bCW", "author_site": "Benjamin Eysenbach, Shixiang Gu, Julian Ibarz, Sergey Levine", "tldr": "We propose an autonomous method for safe and efficient reinforcement learning that simultaneously learns a forward and backward policy, with the backward policy resetting the environment for a subsequent attempt.", "abstract": "Deep reinforcement learning algorithms can learn complex behavioral skills, but real-world application of these methods requires a considerable amount of experience to be collected by the agent. In practical settings, such as robotics, this involves repeatedly attempting a task, resetting the environment between each attempt. However, not all tasks are easily or automatically reversible. In practice, this learning process requires considerable human intervention. In this work, we propose an autonomous method for safe and efficient reinforcement learning that simultaneously learns a forward and backward policy, with the backward policy resetting the environment for a subsequent attempt. 
By learning a value function for the backward policy, we can automatically determine when the forward policy is about to enter a non-reversible state, providing for uncertainty-aware safety aborts. Our experiments illustrate that proper use of the backward policy can greatly reduce the number of manual resets required to learn a task and can reduce the number of unsafe actions that lead to non-reversible states.", "keywords": "manual reset;continual learning;reinforcement learning;safety", "primary_area": "", "supplementary_material": "", "author": "Benjamin Eysenbach;Shixiang Gu;Julian Ibarz;Sergey Levine", "authorids": "eysenbach@google.com;sg717@cam.ac.uk;julianibarz@google.com;slevine@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\neysenbach2018leave,\ntitle={Leave no Trace: Learning to Reset for Safe and Autonomous Reinforcement Learning},\nauthor={Benjamin Eysenbach and Shixiang Gu and Julian Ibarz and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=S1vuO-bCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;5;4;4", "rating_avg": 6.25, "confidence_avg": 4.25, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.17407765595569782, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12404374759487598393&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1vuO-bCW", "pdf": "https://openreview.net/pdf?id=S1vuO-bCW", "email": ";;;", "author_num": 4 }, { "id": "S1xDcSR6W", "title": "Hybed: Hyperbolic Neural Graph Embedding", "track": "main", "status": "Reject", "tldr": "We learn neural embeddings of graphs in hyperbolic instead of Euclidean space", "abstract": "Neural embeddings have been used with great success in Natural Language Processing (NLP) where they provide compact representations that encapsulate word similarity and attain state-of-the-art performance in a range of linguistic tasks. The success of neural embeddings has prompted significant amounts of research into applications in domains other than language. One such domain is graph-structured data, where embeddings of vertices can be learned that encapsulate vertex similarity and improve performance on tasks including edge prediction and vertex labelling. For both NLP and graph-based tasks, embeddings in high-dimensional Euclidean spaces have been learned.\nHowever, recent work has shown that the appropriate isometric space for embedding complex networks is not the flat Euclidean space, but a negatively curved hyperbolic space. We present a new concept that exploits these recent insights and propose learning neural embeddings of graphs in hyperbolic space. We provide experimental evidence that hyperbolic embeddings significantly outperform Euclidean embeddings on vertex classification tasks for several real-world public datasets. 
", "keywords": "embeddings;hyperbolic space;neural networks;geometry", "primary_area": "", "supplementary_material": "", "author": "Benjamin Paul Chamberlain;James R Clough;Marc Peter Deisenroth", "authorids": "benjamin.chamberlain@gmail.com;james.clough@kcl.ac.uk;m.deisenroth@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npaul2018hybed,\ntitle={Hybed: Hyperbolic Neural Graph Embedding},\nauthor={Benjamin Paul Chamberlain and James R Clough and Marc Peter Deisenroth},\nyear={2018},\nurl={https://openreview.net/forum?id=S1xDcSR6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1xDcSR6W", "pdf_size": 0, "rating": "4;4;5;7", "confidence": "3;3;3;2", "rating_avg": 5.0, "confidence_avg": 2.75, "replies_avg": 21, "authors#_avg": 3, "corr_rating_confidence": -0.9428090415820632, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-wakOfswt7IJ:scholar.google.com/&scioq=Hybed:+Hyperbolic+Neural+Graph+Embedding&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Mastering the Dungeon: Grounded Language Learning by Mechanical Turker Descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/176", "id": "SJ-C6JbRW", "author_site": "Zhilin Yang, Saizheng Zhang, Jack Urbanek, Will Feng, Alexander Miller, Arthur Szlam, Douwe Kiela, Jason Weston", "tldr": "", "abstract": "Contrary to most natural language processing research, which makes use of static datasets, humans learn language interactively, grounded in an environment. In this work we propose an interactive learning procedure called Mechanical Turker Descent (MTD) that trains agents to execute natural language commands grounded in a fantasy text adventure game. In MTD, Turkers compete to train better agents in the short term, and collaborate by sharing their agents' skills in the long term. 
This results in a gamified, engaging experience for the Turkers and a better quality teaching signal for the agents compared to static datasets, as the Turkers naturally adapt the training data to the agent's abilities.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhilin Yang;Saizheng Zhang;Jack Urbanek;Will Feng;Alexander Miller;Arthur Szlam;Douwe Kiela;Jason Weston", "authorids": "zhiliny@cs.cmu.edu;saizheng.zhang@umontreal.ca;jju@fb.com;willfeng@fb.com;ahm@fb.com;aszlam@fb.com;dkiela@fb.com;jase@fb.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nyang2018mastering,\ntitle={Mastering the Dungeon: Grounded Language Learning by Mechanical Turker Descent},\nauthor={Zhilin Yang and Saizheng Zhang and Jack Urbanek and Will Feng and Alexander Miller and Arthur Szlam and Douwe Kiela and Jason Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ-C6JbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 8, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13176067834774442541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SJ-C6JbRW", "pdf": "https://openreview.net/pdf?id=SJ-C6JbRW", "email": ";;;;;;;", "author_num": 8 }, { "id": "SJ19eUg0-", "title": "BLOCK-DIAGONAL HESSIAN-FREE OPTIMIZATION FOR TRAINING NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Second-order methods for neural network optimization have several advantages over methods based on first-order gradient descent, including better scaling to large mini-batch sizes and fewer updates needed for convergence. But they are rarely applied to deep learning in practice because of high computational cost and the need for model-dependent algorithmic variations. We introduce a variant of the Hessian-free method that leverages a block-diagonal approximation of the generalized Gauss-Newton matrix. Our method computes the curvature approximation matrix only for pairs of parameters from the same layer or block of the neural network and performs conjugate gradient updates independently for each block. 
Experiments on deep autoencoders, deep convolutional networks, and multilayer LSTMs demonstrate better convergence and generalization compared to the original Hessian-free approach and the Adam method.", "keywords": "deep learning;second-order optimization;hessian free", "primary_area": "", "supplementary_material": "", "author": "Huishuai Zhang;Caiming Xiong;James Bradbury;Richard Socher", "authorids": "hzhan23@syr.edu;cxiong@salesforce.com;james.bradbury@salesforce.com;richard@socher.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2018blockdiagonal,\ntitle={{BLOCK}-{DIAGONAL} {HESSIAN}-{FREE} {OPTIMIZATION} {FOR} {TRAINING} {NEURAL} {NETWORKS}},\nauthor={Huishuai Zhang and Caiming Xiong and James Bradbury and Richard Socher},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ19eUg0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJ19eUg0-", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12711386451765241315&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "FearNet: Brain-Inspired Model for Incremental Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/81", "id": "SJ1Xmf-Rb", "author_site": "Ronald Kemker, Christopher Kanan", "tldr": "FearNet is a memory efficient neural-network, inspired by memory formation in the mammalian brain, that is capable of incremental class learning without catastrophic forgetting.", "abstract": "Incremental class learning involves sequentially learning classes in bursts of examples from the same class. This violates the assumptions that underlie methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting. Arguably, the best method for incremental class learning is iCaRL, but it requires storing training examples for each class, making it challenging to scale. Here, we propose FearNet for incremental class learning. FearNet is a generative model that does not store previous examples, making it memory efficient. FearNet uses a brain-inspired dual-memory system in which new memories are consolidated from a network for recent memories inspired by the mammalian hippocampal complex to a network for long-term storage inspired by medial prefrontal cortex. Memory consolidation is inspired by mechanisms that occur during sleep. FearNet also uses a module inspired by the basolateral amygdala for determining which memory system to use for recall. 
FearNet achieves state-of-the-art performance at incremental class learning on image (CIFAR-100, CUB-200) and audio classification (AudioSet) benchmarks.\n", "keywords": "Incremental Learning;Lifelong Learning;Supervised Learning;Catastrophic Forgetting;Brain-Inspired;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Ronald Kemker;Christopher Kanan", "authorids": "rmk6217@rit.edu;kanan@rit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkemker2018fearnet,\ntitle={FearNet: Brain-Inspired Model for Incremental Learning},\nauthor={Ronald Kemker and Christopher Kanan},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ1Xmf-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;4;2", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 613, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3792376045058171047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SJ1Xmf-Rb", "pdf": "https://openreview.net/pdf?id=SJ1Xmf-Rb", "email": ";", "author_num": 2 }, { "id": "SJ1fQYlCZ", "title": "Training with Growing Sets: A Simple Alternative to Curriculum Learning and Self Paced Learning", "track": "main", "status": "Reject", "tldr": "We propose that training with growing sets stage-by-stage provides an optimization for neural networks.", "abstract": "Curriculum learning and Self paced learning are popular topics in machine learning that suggest putting the training samples in order by considering their difficulty levels. Studies in these topics show that starting with a small training set and adding new samples according to difficulty levels improves the learning performance. In this paper we show experimentally that we can also obtain good results by adding the samples randomly, without a meaningful order. We compared our method with classical training, Curriculum learning, Self paced learning and their reverse ordered versions. Results of the statistical tests show that the proposed method is better than the classical method and similar to the others. 
These results point to a new training regime that removes the process of difficulty level determination in Curriculum and Self paced learning and is as successful as these methods.", "keywords": "Neural networks;Curriculum learning;Self paced learning", "primary_area": "", "supplementary_material": "", "author": "Melike Nur Mermer;Mehmet Fatih Amasyali", "authorids": "melike.mermer@izu.edu.tr;mfatih@ce.yildiz.edu.tr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnur2018training,\ntitle={Training with Growing Sets: A Simple Alternative to Curriculum Learning and Self Paced Learning},\nauthor={Melike Nur Mermer and Mehmet Fatih Amasyali},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ1fQYlCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJ1fQYlCZ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5687181151497706309&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Multi-Task Learning for Document Ranking and Query Suggestion", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/270", "id": "SJ1nzBeA-", "author_site": "Wasi Ahmad, Kai-Wei Chang, Hongning Wang", "tldr": "", "abstract": "We propose a multi-task learning framework to jointly learn document ranking and query suggestion for web search. It consists of two major components, a document ranker, and a query recommender. Document ranker combines current query and session information and compares the combined representation with document representation to rank the documents. Query recommender tracks users' query reformulation sequence considering all previous in-session queries using a sequence to sequence approach. As both tasks are driven by the users' underlying search intent, we perform joint learning of these two components through session recurrence, which encodes search context and intent. 
Extensive comparisons against state-of-the-art document ranking and query suggestion algorithms are performed on the public AOL search log, and the promising results endorse the effectiveness of the joint learning framework.", "keywords": "Multitask Learning;Document Ranking;Query Suggestion", "primary_area": "", "supplementary_material": "", "author": "Wasi Uddin Ahmad;Kai-Wei Chang;Hongning Wang", "authorids": "wasiahmad@cs.ucla.edu;kwchang@cs.ucla.edu;hw5x@virginia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nuddin2018multitask,\ntitle={Multi-Task Learning for Document Ranking and Query Suggestion},\nauthor={Wasi Uddin Ahmad and Kai-Wei Chang and Hongning Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ1nzBeA-},\n}", "github": "[![github](/images/github_icon.svg) wasiahmad/mnsrf_ranking_suggestion](https://github.com/wasiahmad/mnsrf_ranking_suggestion)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14352356705152132006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=SJ1nzBeA-", "pdf": "https://openreview.net/pdf?id=SJ1nzBeA-", "email": ";;", "author_num": 3 }, { "id": "SJ3dBGZ0Z", "title": "LSH Softmax: Sub-Linear Learning and Inference of the Softmax Layer in Deep Architectures", "track": "main", "status": "Reject", "tldr": "we present LSH Softmax, a softmax approximation layer for sub-linear learning and inference with strong theoretical guarantees; we showcase both its applicability and efficiency by evaluating on a real-world task: language modeling.", "abstract": "Log-linear models models are widely used in machine learning, and in particular are ubiquitous in deep learning architectures in the form of the softmax. While exact inference and learning of these requires linear time, it can be done approximately in sub-linear time with strong concentrations guarantees. In this work, we present LSH Softmax, a method to perform sub-linear learning and inference of the softmax layer in the deep learning setting. Our method relies on the popular Locality-Sensitive Hashing to build a well-concentrated gradient estimator, using nearest neighbors and uniform samples. We also present an inference scheme in sub-linear time for LSH Softmax using the Gumbel distribution. 
On language modeling, we show that Recurrent Neural Networks trained with LSH Softmax perform on-par with computing the exact softmax while requiring sub-linear computations.", "keywords": "LSH;softmax;deep;learning;sub;linear;efficient;GPU", "primary_area": "", "supplementary_material": "", "author": "Daniel Levy;Danlu Chan;Stefano Ermon", "authorids": "danilevy@cs.stanford.edu;taineleau@gmail.com;ermon@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlevy2018lsh,\ntitle={{LSH} Softmax: Sub-Linear Learning and Inference of the Softmax Layer in Deep Architectures},\nauthor={Daniel Levy and Danlu Chan and Stefano Ermon},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ3dBGZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJ3dBGZ0Z", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJ60SbW0b", "title": "Modeling Latent Attention Within Neural Networks", "track": "main", "status": "Reject", "tldr": "We develop a technique to visualize attention mechanisms in arbitrary neural networks. ", "abstract": "Deep neural networks are able to solve tasks across a variety of domains and modalities of data. Despite many empirical successes, we lack the ability to clearly understand and interpret the learned mechanisms that contribute to such effective behaviors and more critically, failure modes. In this work, we present a general method for visualizing an arbitrary neural network's inner mechanisms and their power and limitations. Our dataset-centric method produces visualizations of how a trained network attends to components of its inputs. The computed \"attention masks\" support improved interpretability by highlighting which input attributes are critical in determining output. We demonstrate the effectiveness of our framework on a variety of deep neural network architectures in domains from computer vision and natural language processing. The primary contribution of our approach is an interpretable visualization of attention that provides unique insights into the network's underlying decision-making process irrespective of the data modality.", "keywords": "deep learning;neural network;attention;attention mechanism;interpretability;visualization", "primary_area": "", "supplementary_material": "", "author": "Christopher Grimm;Dilip Arumugam;Siddharth Karamcheti;David Abel;Lawson L.S. Wong;Michael L. Littman", "authorids": "crgrimm@umich.edu;dilip_arumugam@brown.edu;siddharth_karamcheti@brown.edu;david_abel@brown.edu;lsw@brown.edu;mlittman@cs.brown.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngrimm2018modeling,\ntitle={Modeling Latent Attention Within Neural Networks},\nauthor={Christopher Grimm and Dilip Arumugam and Siddharth Karamcheti and David Abel and Lawson L.S. Wong and Michael L. 
Littman},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ60SbW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJ60SbW0b", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7142182764381082164&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJ71VXZAZ", "title": "Learning To Generate Reviews and Discovering Sentiment", "track": "main", "status": "Reject", "tldr": "Byte-level recurrent language models learn high-quality domain specific representations of text.", "abstract": "We explore the properties of byte-level recurrent language models. When given sufficient amounts of capacity, training data, and compute time, the representations learned by these models include disentangled features corresponding to high-level concepts. Specifically, we find a single unit which performs sentiment analysis. These representations, learned in an unsupervised manner, achieve state of the art on the binary subset of the Stanford Sentiment Treebank. They are also very data efficient. When using only a handful of labeled examples, our approach matches the performance of strong baselines trained on full datasets. We also demonstrate the sentiment unit has a direct influence on the generative process of the model. Simply fixing its value to be positive or negative generates samples with the corresponding positive or negative sentiment.", "keywords": "unsupervised learning;representation learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Alec Radford;Rafal Jozefowicz;Ilya Sutskever", "authorids": "alec@openai.com;rafal@openai.com;ilya@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nradford2018learning,\ntitle={Learning To Generate Reviews and Discovering Sentiment},\nauthor={Alec Radford and Rafal Jozefowicz and Ilya Sutskever},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ71VXZAZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJ71VXZAZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJ71VXZAZ", "pdf_size": 0, "rating": "2;4;4", "confidence": "5;5;3", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 621, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4855276330444042454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SJ8M9yup-", "title": "On Optimality Conditions for Auto-Encoder Signal Recovery", "track": "main", "status": "Reject", "tldr": "", "abstract": "Auto-Encoders are unsupervised models that aim to learn patterns from observed data by minimizing a reconstruction cost. The useful representations learned are often found to be sparse and distributed. On the other hand, compressed sensing and sparse coding assume a data generating process, where the observed data is generated from some true latent signal source, and try to recover the corresponding signal from measurements. 
Looking at auto-encoders from this signal recovery perspective enables us to have a more coherent view of these techniques. In this paper, in particular, we show that the true hidden representation can be approximately recovered if the weight matrices are highly incoherent with unit $ \\ell^{2} $ row length and the bias vectors takes the value (approximately) equal to the negative of the data mean. The recovery also becomes more and more accurate as the sparsity in hidden signals increases. Additionally, we empirically also demonstrate that auto-encoders are capable of recovering the data generating dictionary when only data samples are given.", "keywords": "Auto Encoder;Signal Recovery;Sparse Coding", "primary_area": "", "supplementary_material": "", "author": "Devansh Arpit;Yingbo Zhou;Hung Q. Ngo;Nils Napp;Venu Govindaraju", "authorids": "devansharpit@gmail.com;zybzmhhj@gmail.com;hungngo@buffalo.edu;nnapp@buffalo.edu;venu@cubs.buffalo.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\narpit2018on,\ntitle={On Optimality Conditions for Auto-Encoder Signal Recovery},\nauthor={Devansh Arpit and Yingbo Zhou and Hung Q. Ngo and Nils Napp and Venu Govindaraju},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ8M9yup-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJ8M9yup-", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15717345754884462846&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Sobolev GAN", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/98", "id": "SJA7xfb0b", "author_site": "Youssef Mroueh, Chun-Liang Li, Tom Sercu, Anant Raj, Yu Cheng", "tldr": "We define a new Integral Probability Metric (Sobolev IPM) and show how it can be used for training GANs for text generation and semi-supervised learning.", "abstract": "We propose a new Integral Probability Metric (IPM) between distributions: the Sobolev IPM. The Sobolev IPM compares the mean discrepancy of two distributions for functions (critic) restricted to a Sobolev ball defined with respect to a dominant measure mu. We show that the Sobolev IPM compares two distributions in high dimensions based on weighted conditional Cumulative Distribution Functions (CDF) of each coordinate on a leave one out basis. The Dominant measure mu plays a crucial role as it defines the support on which conditional CDFs are compared. Sobolev IPM can be seen as an extension of the one dimensional Von-Mises Cramer statistics to high dimensional distributions. We show how Sobolev IPM can be used to train Generative Adversarial Networks (GANs). We then exploit the intrinsic conditioning implied by Sobolev IPM in text generation. 
Finally we show that a variant of Sobolev GAN achieves competitive results in semi-supervised learning on CIFAR-10, thanks to the smoothness enforced on the critic by Sobolev GAN which relates to Laplacian regularization.", "keywords": "GAN theory;Integral Probability Metrics;elliptic PDE and diffusion;GAN for discrete sequences;semi-supervised learning.", "primary_area": "", "supplementary_material": "", "author": "Youssef Mroueh;Chun-Liang Li;Tom Sercu;Anant Raj;Yu Cheng", "authorids": "mroueh@us.ibm.com;chunlial@cs.cmu.edu;tom.sercu1@ibm.com;anant.raj@tuebingen.mpg.de;chengyu@us.ibm.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmroueh2018sobolev,\ntitle={Sobolev {GAN}},\nauthor={Youssef Mroueh and Chun-Liang Li and Tom Sercu and Anant Raj and Yu Cheng},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJA7xfb0b},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJA7xfb0b)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7;8", "confidence": "3;4;3;4", "rating_avg": 6.75, "confidence_avg": 3.5, "replies_avg": 21, "authors#_avg": 5, "corr_rating_confidence": 0.30151134457776363, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16587521411741023583&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJA7xfb0b", "pdf": "https://openreview.net/pdf?id=SJA7xfb0b", "email": ";;;;", "author_num": 5 }, { "id": "SJCPLLpaW", "title": "Exploring the Hidden Dimension in Accelerating Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "To the best of our knowledge, DeePa is the first deep learning framework that controls and optimizes the parallelism of CNNs in all parallelizable dimensions at the granularity of each layer.", "abstract": "DeePa is a deep learning framework that explores parallelism in all parallelizable dimensions to accelerate the training process of convolutional neural networks. DeePa optimizes parallelism at the granularity of each individual layer in the network. We present an elimination-based algorithm that finds an optimal parallelism configuration for every layer. Our evaluation shows that DeePa achieves up to 6.5\u00d7 speedup compared to state-of-the-art deep learning frameworks and reduces data transfers by up to 23\u00d7.", "keywords": "Parallelism of Convolutional Neural Networks;Accelerating Convolutional Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Zhihao Jia;Sina Lin;Charles R. Qi;Alex Aiken", "authorids": "zhihao@cs.stanford.edu;silin@microsoft.com;rqi@stanford.edu;aiken@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njia2018exploring,\ntitle={Exploring the Hidden Dimension in Accelerating Convolutional Neural Networks},\nauthor={Zhihao Jia and Sina Lin and Charles R. 
Qi and Alex Aiken},\nyear={2018},\nurl={https://openreview.net/forum?id=SJCPLLpaW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SJCPLLpaW", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3576848664502386850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJCq_fZ0Z", "title": "Sparse Attentive Backtracking: Long-Range Credit Assignment in Recurrent Networks", "track": "main", "status": "Reject", "tldr": "Towards Efficient Credit Assignment in Recurrent Networks without Backpropagation Through Time", "abstract": "A major drawback of backpropagation through time (BPTT) is the difficulty of learning long-term dependencies, coming from having to propagate credit information backwards through every single step of the forward computation. This makes BPTT both computationally impractical and biologically implausible. For this reason, full backpropagation through time is rarely used on long sequences, and truncated backpropagation through time is used as a heuristic. However, this usually leads to biased estimates of the gradient in which longer term dependencies are ignored. Addressing this issue, we propose an alternative algorithm, Sparse Attentive Backtracking, which might also be related to principles used by brains to learn long-term dependencies. Sparse Attentive Backtracking learns an attention mechanism over the hidden states of the past and selectively backpropagates through paths with high attention weights. This allows the model to learn long term dependencies while only backtracking for a small number of time steps, not just from the recent past but also from attended relevant past states. 
", "keywords": "recurrent neural networks;long-term dependencies;back-propagation through time;truncated back-propagation;biological inspiration;self-attention", "primary_area": "", "supplementary_material": "", "author": "Nan Rosemary Ke;Anirudh Goyal;Olexa Bilaniuk;Jonathan Binas;Laurent Charlin;Chris Pal;Yoshua Bengio", "authorids": "rosemary.nan.ke@gmail.com;anirudhgoyal9119@gmail.com;obilaniu@gmail.com;jbinas@gmail.com;lcharlin@gmail.com;chris.j.pal@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nrosemary2018sparse,\ntitle={Sparse Attentive Backtracking: Long-Range Credit Assignment in Recurrent Networks},\nauthor={Nan Rosemary Ke and Anirudh Goyal and Olexa Bilaniuk and Jonathan Binas and Laurent Charlin and Chris Pal and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=SJCq_fZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=SJCq_fZ0Z", "pdf_size": 0, "rating": "5;5;8", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": 0.5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=86721828185121584&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "SJD8YjCpW", "title": "Balanced and Deterministic Weight-sharing Helps Network Performance", "track": "main", "status": "Reject", "tldr": "Studied the role of weight sharing in neural networks using hash functions, found that a balanced and deterministic hash function helps network performance.", "abstract": "Weight-sharing plays a significant role in the success of many deep neural networks, by increasing memory efficiency and incorporating useful inductive priors about the problem into the network. But understanding how weight-sharing can be used effectively in general is a topic that has not been studied extensively. Chen et al. (2015) proposed HashedNets, which augments a multi-layer perceptron with a hash table, as a method for neural network compression. We generalize this method into a framework (ArbNets) that allows for efficient arbitrary weight-sharing, and use it to study the role of weight-sharing in neural networks. We show that common neural networks can be expressed as ArbNets with different hash functions. 
We also present two novel hash functions, the Dirichlet hash and the Neighborhood hash, and use them to demonstrate experimentally that balanced and deterministic weight-sharing helps with the performance of a neural network.", "keywords": "Weight-sharing;Weight sharing;Weight tying;neural networks;entropy;hash function;hash table;balance;sparse;sparsity;hashednets", "primary_area": "", "supplementary_material": "", "author": "Oscar Chang;Hod Lipson", "authorids": "oscar.chang@columbia.edu;hod.lipson@columbia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchang2018balanced,\ntitle={Balanced and Deterministic Weight-sharing Helps Network Performance},\nauthor={Oscar Chang and Hod Lipson},\nyear={2018},\nurl={https://openreview.net/forum?id=SJD8YjCpW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SJD8YjCpW", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17634080090115660228&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJDJNzWAZ", "title": "Time-Dependent Representation for Neural Event Sequence Prediction", "track": "main", "status": "Workshop", "tldr": "Proposed methods for time-dependent event representation and regularization for sequence prediction; Evaluated these methods on five datasets that involve a range of sequence prediction tasks.", "abstract": "Existing sequence prediction methods are mostly concerned with time-independent sequences, in which the actual time span between events is irrelevant and the distance between events is simply the difference between their order positions in the sequence. While this time-independent view of sequences is applicable for data such as natural languages, e.g., dealing with words in a sentence, it is inappropriate and inefficient for many real world events that are observed and collected at unequally spaced points of time as they naturally arise, e.g., when a person goes to a grocery store or makes a phone call. The time span between events can carry important information about the sequence dependence of human behaviors. In this work, we propose a set of methods for using time in sequence prediction. Because neural sequence models such as RNN are more amenable for handling token-like input, we propose two methods for time-dependent event representation, based on the intuition on how time is tokenized in everyday life and previous work on embedding contextualization. We also introduce two methods for using next event duration as regularization for training a sequence prediction model. We discuss these methods based on recurrent neural nets. We evaluate these methods as well as baseline models on five datasets that resemble a variety of sequence prediction tasks. 
The experiments revealed that the proposed methods offer accuracy gains over baseline models in a range of settings.", "keywords": "Neural sequence prediction;Embedding;LSTM;Regularization", "primary_area": "", "supplementary_material": "", "author": "Yang Li;Nan Du;Samy Bengio", "authorids": "liyang@google.com;dunan@google.com;bengio@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2018timedependent,\ntitle={Time-Dependent Representation for Neural Event Sequence Prediction},\nauthor={Yang Li and Nan Du and Samy Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=SJDJNzWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJDJNzWAZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3807850542614666163&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "SJDYgPgCZ", "title": "Understanding Local Minima in Neural Networks by Loss Surface Decomposition", "track": "main", "status": "Reject", "tldr": "The loss surface of neural networks is a disjoint union of regions where every local minimum is a global minimum of the corresponding region.", "abstract": "To provide principled ways of designing proper Deep Neural Network (DNN) models, it is essential to understand the loss surface of DNNs under realistic assumptions. We introduce interesting aspects for understanding the local minima and overall structure of the loss surface. The parameter domain of the loss surface can be decomposed into regions in which activation values (zero or one for rectified linear units) are consistent. We found that, in each region, the loss surface has properties similar to those of linear neural networks, where every local minimum is a global minimum. This means that every differentiable local minimum is the global minimum of the corresponding region. We prove this for a neural network with one hidden layer using rectified linear units under realistic assumptions. 
There are poor regions that lead to poor local minima, and we explain why such regions exist even in the overparameterized DNNs.", "keywords": "neural network;local minima;global minima;saddle point;optimization;loss surface;rectified linear unit;loss surface decomposition;gradient descent", "primary_area": "", "supplementary_material": "", "author": "Hanock Kwak;Byoung-Tak Zhang", "authorids": "hnkwak@bi.snu.ac.kr;btzhang@bi.snu.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkwak2018understanding,\ntitle={Understanding Local Minima in Neural Networks by Loss Surface Decomposition},\nauthor={Hanock Kwak and Byoung-Tak Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=SJDYgPgCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJDYgPgCZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17548205009388062622&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SJFM0ZWCb", "title": "Deep Temporal Clustering: Fully unsupervised learning of time-domain features", "track": "main", "status": "Reject", "tldr": "A fully unsupervised method, to naturally integrate dimensionality reduction and temporal clustering into a single end to end learning framework.", "abstract": "Unsupervised learning of timeseries data is a challenging problem in machine learning. Here, \nwe propose a novel algorithm, Deep Temporal Clustering (DTC), a fully unsupervised method, to naturally integrate dimensionality reduction and temporal clustering into a single end to end learning framework. The algorithm starts with an initial cluster estimates using an autoencoder for dimensionality reduction and a novel temporal clustering layer for cluster assignment. Then it jointly optimizes the clustering objective and the dimensionality reduction objective. Based on requirement and application, the temporal clustering layer can be customized with any temporal similarity metric. Several similarity metrics are considered and compared. To gain insight into features that the network has learned for its clustering, we apply a visualization method that generates a heat map of regions of interest in the timeseries. The viability of the algorithm is demonstrated using timeseries data from diverse domains, ranging from earthquakes to sensor data from spacecraft. In each case, we show that our algorithm outperforms traditional methods. This performance is attributed to fully integrated temporal dimensionality reduction and clustering criterion.", "keywords": "Unsupervised deep learning;Temporal clustering;Event Visualization", "primary_area": "", "supplementary_material": "", "author": "Naveen Sai Madiraju;Seid M. Sadat;Dimitry Fisher;Homa Karimabadi", "authorids": "naveen@avlab.ai;behnam@avlab.ai;dimitry@avlab.ai;homa@avlab.ai", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsai2018deep,\ntitle={Deep Temporal Clustering: Fully unsupervised learning of time-domain features},\nauthor={Naveen Sai Madiraju and Seid M. 
Sadat and Dimitry Fisher and Homa Karimabadi},\nyear={2018},\nurl={https://openreview.net/forum?id=SJFM0ZWCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJFM0ZWCb", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18406976275324228078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SJIA6ZWC-", "title": "Stochastic Hyperparameter Optimization through Hypernetworks", "track": "main", "status": "Reject", "tldr": "We train a neural network to output approximately optimal weights as a function of hyperparameters.", "abstract": "Machine learning models are usually tuned by nesting optimization of model weights inside the optimization of hyperparameters. We give a method to collapse this nested optimization into joint stochastic optimization of both weights and hyperparameters. Our method trains a neural network to output approximately optimal weights as a function of hyperparameters. We show that our method converges to locally optimal weights and hyperparameters for sufficiently large hypernets. We compare this method to standard hyperparameter optimization strategies and demonstrate its effectiveness for tuning thousands of hyperparameters.", "keywords": "hypernetworks;hyperparameter optimization;metalearning;neural networks;Bayesian optimization;game theory;optimization", "primary_area": "", "supplementary_material": "", "author": "Jonathan Lorraine;David Duvenaud", "authorids": "lorraine@cs.toronto.edu;duvenaud@cs.toronto.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlorraine2018stochastic,\ntitle={Stochastic Hyperparameter Optimization through Hypernetworks},\nauthor={Jonathan Lorraine and David Duvenaud},\nyear={2018},\nurl={https://openreview.net/forum?id=SJIA6ZWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJIA6ZWC-", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;1;3", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 164, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15108084633119213570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "SJICXeWAb", "title": "Depth separation and weight-width trade-offs for sigmoidal neural networks", "track": "main", "status": "Reject", "tldr": "depth-2-vs-3 separation for sigmoidal neural networks over general distributions", "abstract": "Some recent work has shown separation between the expressive power of depth-2 and depth-3 neural networks. These separation results are shown by constructing functions and input distributions, so that the function is well-approximable by a depth-3 neural network of polynomial size but it cannot be well-approximated under the chosen input distribution by any depth-2 neural network of polynomial size. 
These results are not robust and require carefully chosen functions as well as input distributions.\n\nWe show a similar separation between the expressive power of depth-2 and depth-3 sigmoidal neural networks over a large class of input distributions, as long as the weights are polynomially bounded. While doing so, we also show that depth-2 sigmoidal neural networks with small width and small weights can be well-approximated by low-degree multivariate polynomials.", "keywords": "depth separation;neural networks;weights-width trade-off", "primary_area": "", "supplementary_material": "", "author": "Amit Deshpande;Navin Goyal;Sushrut Karmalkar", "authorids": "amitdesh@microsoft.com;navingo@microsoft.com;sushrutk@cs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndeshpande2018depth,\ntitle={Depth separation and weight-width trade-offs for sigmoidal neural networks},\nauthor={Amit Deshpande and Navin Goyal and Sushrut Karmalkar},\nyear={2018},\nurl={https://openreview.net/forum?id=SJICXeWAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJICXeWAb", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14634036276556508521&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Hierarchical and Interpretable Skill Acquisition in Multi-task Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/128", "id": "SJJQVZW0b", "author_site": "Tianmin Shu, Caiming Xiong, richard socher", "tldr": "A novel hierarchical policy network which can reuse previously learned skills alongside and as subcomponents of new skills by discovering the underlying relations between skills.", "abstract": "Learning policies for complex tasks that require multiple different skills is a major challenge in reinforcement learning (RL). It is also a requirement for its deployment in real-world scenarios. This paper proposes a novel framework for efficient multi-task reinforcement learning. Our framework trains agents to employ hierarchical policies that decide when to use a previously learned policy and when to learn a new skill. This enables agents to continually acquire new skills during different stages of training. Each learned task corresponds to a human language description. Because agents can only access previously learned skills through these descriptions, the agent can always provide a human-interpretable description of its choices. In order to help the agent learn the complex temporal dependencies necessary for the hierarchical policy, we provide it with a stochastic temporal grammar that modulates when to rely on previously learned skills and when to execute new skills. 
We validate our approach on Minecraft games designed to explicitly test the ability to reuse previously learned skills while simultaneously learning new skills.", "keywords": "Hierarchical Policy;Interpretable Policy;Deep Reinforcement Learning;Multi-task Reinforcement Learning;Skill Acquisition;Language Grounding", "primary_area": "", "supplementary_material": "", "author": "Tianmin Shu;Caiming Xiong;Richard Socher", "authorids": "tianmin.shu@ucla.edu;cxiong@salesforce.com;richard@socher.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nshu2018hierarchical,\ntitle={Hierarchical and Interpretable Skill Acquisition in Multi-task Reinforcement Learning},\nauthor={Tianmin Shu and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJJQVZW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7462925893292154188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SJJQVZW0b", "pdf": "https://openreview.net/pdf?id=SJJQVZW0b", "email": ";;", "author_num": 3 }, { "title": "Model-Ensemble Trust-Region Policy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/111", "id": "SJJinbWRZ", "author_site": "Thanard Kurutach, Ignasi Clavera, Yan Duan, Aviv Tamar, Pieter Abbeel", "tldr": "Deep Model-Based RL that works well.", "abstract": "Model-free reinforcement learning (RL) methods are succeeding in a growing number of tasks, aided by recent advances in deep learning. However, they tend to suffer from high sample complexity, which hinders their use in real-world domains. Alternatively, model-based reinforcement learning promises to reduce sample complexity, but tends to require careful tuning and to date have succeeded mainly in restrictive domains where simple models are sufficient for learning. In this paper, we analyze the behavior of vanilla model-based reinforcement learning methods when deep neural networks are used to learn both the model and the policy, and show that the learned policy tends to exploit regions where insufficient data is available for the model to be learned, causing instability in training. To overcome this issue, we propose to use an ensemble of models to maintain the model uncertainty and regularize the learning process. We further show that the use of likelihood ratio derivatives yields much more stable learning than backpropagation through time. 
Altogether, our approach Model-Ensemble Trust-Region Policy Optimization (ME-TRPO) significantly reduces the sample complexity compared to model-free deep RL methods on challenging continuous control benchmark tasks.", "keywords": "model-based reinforcement learning;model ensemble;reinforcement learning;model bias", "primary_area": "", "supplementary_material": "", "author": "Thanard Kurutach;Ignasi Clavera;Yan Duan;Aviv Tamar;Pieter Abbeel", "authorids": "thanard.kurutach@berkeley.edu;iclavera@berkeley.edu;rockyduan@eecs.berkeley.edu;avivt@berkeley.edu;pabbeel@cs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkurutach2018modelensemble,\ntitle={Model-Ensemble Trust-Region Policy Optimization},\nauthor={Thanard Kurutach and Ignasi Clavera and Yan Duan and Aviv Tamar and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJJinbWRZ},\n}", "github": "[![github](/images/github_icon.svg) thanard/me-trpo](https://github.com/thanard/me-trpo) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SJJinbWRZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 592, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5763230631763342838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SJJinbWRZ", "pdf": "https://openreview.net/pdf?id=SJJinbWRZ", "email": ";;;;", "author_num": 5 }, { "title": "Training GANs with Optimism", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/127", "id": "SJJySbbAZ", "author_site": "Constantinos C Daskalakis, Andrew Ilyas, Vasilis Syrgkanis, Haoyang Zeng", "tldr": "We propose the use of optimistic mirror decent to address cycling problems in the training of GANs. We also introduce the Optimistic Adam algorithm", "abstract": "We address the issue of limit cycling behavior in training Generative Adversarial Networks and propose the use of Optimistic Mirror Decent (OMD) for training Wasserstein GANs. Recent theoretical results have shown that optimistic mirror decent (OMD) can enjoy faster regret rates in the context of zero-sum games. WGANs is exactly a context of solving a zero-sum game with simultaneous no-regret dynamics. Moreover, we show that optimistic mirror decent addresses the limit cycling problem in training WGANs. We formally show that in the case of bi-linear zero-sum games the last iterate of OMD dynamics converges to an equilibrium, in contrast to GD dynamics which are bound to cycle. We also portray the huge qualitative difference between GD and OMD dynamics with toy examples, even when GD is modified with many adaptations proposed in the recent literature, such as gradient penalty or momentum. We apply OMD WGAN training to a bioinformatics problem of generating DNA sequences. We observe that models trained with OMD achieve consistently smaller KL divergence with respect to the true underlying distribution, than models trained with GD variants. 
Finally, we introduce a new algorithm, Optimistic Adam, which is an optimistic variant of Adam. We apply it to WGAN training on CIFAR10 and observe improved performance in terms of inception score as compared to Adam.", "keywords": "GANs;Optimistic Mirror Decent;Cycling;Last Iterate Convergence;Optimistic Adam", "primary_area": "", "supplementary_material": "", "author": "Constantinos Daskalakis;Andrew Ilyas;Vasilis Syrgkanis;Haoyang Zeng", "authorids": "costis@mit.edu;ailyas@mit.edu;vasy@microsoft.com;haoyangz@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndaskalakis2018training,\ntitle={Training {GAN}s with Optimism},\nauthor={Constantinos Daskalakis and Andrew Ilyas and Vasilis Syrgkanis and Haoyang Zeng},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJJySbbAZ},\n}", "github": "[![github](/images/github_icon.svg) vsyrgkanis/optimistic_GAN_training](https://github.com/vsyrgkanis/optimistic_GAN_training)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 624, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=721555332302459217&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SJJySbbAZ", "pdf": "https://openreview.net/pdf?id=SJJySbbAZ", "email": ";;;", "author_num": 4 }, { "title": "Understanding image motion with group representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/84", "id": "SJLlmG-AZ", "author_site": "Andrew Jaegle, Stephen Phillips, Daphne Ippolito, Kostas Daniilidis", "tldr": "We propose of method of using group properties to learn a representation of motion without labels and demonstrate the use of this method for representing 2D and 3D motion.", "abstract": "Motion is an important signal for agents in dynamic environments, but learning to represent motion from unlabeled video is a difficult and underconstrained problem. We propose a model of motion based on elementary group properties of transformations and use it to train a representation of image motion. While most methods of estimating motion are based on pixel-level constraints, we use these group properties to constrain the abstract representation of motion itself. We demonstrate that a deep neural network trained using this method captures motion in both synthetic 2D sequences and real-world sequences of vehicle motion, without requiring any labels. Networks trained to respect these constraints implicitly identify the image characteristic of motion in different sequence types. In the context of vehicle motion, this method extracts information useful for localization, tracking, and odometry. 
Our results demonstrate that this representation is useful for learning motion in the general setting where explicit labels are difficult to obtain.", "keywords": "vision;motion;recurrent neural networks;self-supervised learning;unsupervised learning;group theory", "primary_area": "", "supplementary_material": "", "author": "Andrew Jaegle;Stephen Phillips;Daphne Ippolito;Kostas Daniilidis", "authorids": "ajaegle@upenn.edu;stephi@seas.upenn.edu;daphnei@seas.upenn.edu;kostas@seas.upenn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njaegle2018understanding,\ntitle={Understanding image motion with group representations },\nauthor={Andrew Jaegle and Stephen Phillips and Daphne Ippolito and Kostas Daniilidis},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJLlmG-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.944911182523068, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4075741295036546048&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SJLlmG-AZ", "pdf": "https://openreview.net/pdf?id=SJLlmG-AZ", "email": ";;;", "author_num": 4 }, { "id": "SJLy_SxC-", "title": "Log-DenseNet: How to Sparsify a DenseNet", "track": "main", "status": "Reject", "tldr": "We show shortcut connections should be placed in patterns that minimize between-layer distances during backpropagation, and design networks that achieve log L distances using L log(L) connections.", "abstract": "Skip connections are increasingly utilized by deep neural networks to improve accuracy and cost-efficiency. In particular, the recent DenseNet is efficient in computation and parameters, and achieves state-of-the-art predictions by directly connecting each feature layer to all previous ones. However, DenseNet's extreme connectivity pattern may hinder its scalability to high depths, and in applications like fully convolutional networks, full DenseNet connections are prohibitively expensive. \nThis work first experimentally shows that one key advantage of skip connections is to have short distances among feature layers during backpropagation. Specifically, using a fixed number of skip connections, the connection patterns with shorter backpropagation distance among layers have more accurate predictions. Following this insight, we propose a connection template, Log-DenseNet, which, in comparison to DenseNet, only slightly increases the backpropagation distances among layers from 1 to ($1 + \\log_2 L$), but uses only $L\\log_2 L$ total connections instead of $O(L^2)$. Hence, \\logdenses are easier to scale than DenseNets, and no longer require careful GPU memory management. 
We demonstrate the effectiveness of our design principle by showing better performance than DenseNets on tabula rasa semantic segmentation, and competitive results on visual recognition.", "keywords": "DenseNet;sparse shortcut connections;network architecture;scene parsing;image classification", "primary_area": "", "supplementary_material": "", "author": "Hanzhang Hu;Debadeepta Dey;Allie Del Giorno;Martial Hebert;J. Andrew Bagnell", "authorids": "hanzhang@cs.cmu.edu;dedey@microsoft.com;adelgior@ri.cmu.edu;hebert@ri.cmu.edu;dbagnell@ri.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhu2018logdensenet,\ntitle={Log-DenseNet: How to Sparsify a DenseNet},\nauthor={Hanzhang Hu and Debadeepta Dey and Allie Del Giorno and Martial Hebert and J. Andrew Bagnell},\nyear={2018},\nurl={https://openreview.net/forum?id=SJLy_SxC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJLy_SxC-", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13227732818616320913&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJOl4DlCZ", "title": "Classifier-to-Generator Attack: Estimation of Training Data Distribution from Classifier", "track": "main", "status": "Reject", "tldr": "Estimation of training data distribution from trained classifier using GAN.", "abstract": "Suppose a deep classification model is trained with samples that need to be kept private for privacy or confidentiality reasons. In this setting, can an adversary obtain the private samples if the classification model is given to the adversary? We call this reverse engineering against the classification model the Classifier-to-Generator (C2G) Attack. This situation arises when the classification model is embedded into mobile devices for offline prediction (e.g., object recognition for the automatic driving car and face recognition for mobile phone authentication).\nFor C2G attack, we introduce a novel GAN, PreImageGAN. In PreImageGAN, the generator is designed to estimate the the sample distribution conditioned by the preimage of classification model $f$, $P(X|f(X)=y)$, where $X$ is the random variable on the sample space and $y$ is the probability vector representing the target label arbitrary specified by the adversary. In experiments, we demonstrate PreImageGAN works successfully with hand-written character recognition and face recognition. In character recognition, we show that, given a recognition model of hand-written digits, PreImageGAN allows the adversary to extract alphabet letter images without knowing that the model is built for alphabet letter images. 
In face recognition, we show that, when an adversary obtains a face recognition model for a set of individuals, PreImageGAN allows the adversary to extract face images of specific individuals contained in the set, even when the adversary has no knowledge of the faces of the individuals.", "keywords": "Security;Privacy;Model Publication;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Kosuke Kusano;Jun Sakuma", "authorids": "cocuh@mdl.cs.tsukuba.ac.jp;jun@cs.tsukuba.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkusano2018classifiertogenerator,\ntitle={Classifier-to-Generator Attack: Estimation of Training Data Distribution from Classifier},\nauthor={Kosuke Kusano and Jun Sakuma},\nyear={2018},\nurl={https://openreview.net/forum?id=SJOl4DlCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJOl4DlCZ", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;3;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11106712300940667614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJPO7JMyG", "title": "Automatic Measurement on Etched Structure in Semiconductor Using Deep Learning Approach", "track": "main", "status": "Withdraw", "tldr": "Using a deep learning method to carry out automatic measurement of SEM images in the semiconductor industry", "abstract": "The fabrication of semiconductors involves an etching process to remove selected areas from wafers. However, the measurement of etched structures in micrographs heavily relies on time-consuming manual routines. Traditional image processing usually demands a large amount of annotated data, and its performance is still poor. We treat this challenge as a segmentation problem and use a deep learning approach to detect masks of objects in the etched structure of a wafer. Then, we use simple image processing to carry out automatic measurement on the objects. We apply a Generative Adversarial Network (GAN) to generate more data to overcome the problem of a very limited dataset. We download 10 SEM (Scanning Electron Microscope) images of 4 types from the Internet, based on which we carry out our experiments. Our deep learning based method demonstrates superiority over the image processing approach, with mean accuracy reaching over 96% for the measurements compared with the ground truth. 
To the best of our knowledge, this is the first time that deep learning has been applied in the semiconductor industry for automatic measurement.", "keywords": "Deep learning;segmentation;automatic measurement;semiconductor;Scanning Electron Microscope", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper327/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018automatic,\n title={Automatic Measurement on Etched Structure in Semiconductor Using Deep Learning Approach},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=r1HKJYeRb}\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SJPO7JMyG", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJPpHzW0-", "title": "Influence-Directed Explanations for Deep Convolutional Networks", "track": "main", "status": "Reject", "tldr": "We present an influence-directed approach to constructing explanations for the behavior of deep neural networks, and show how it can be used to answer a broad set of questions that could not be addressed by prior work.", "abstract": "We study the problem of explaining a rich class of behavioral properties of deep neural networks. Our influence-directed explanations approach this problem by peering inside the network to identify neurons with high influence on the property of interest using an axiomatically justified influence measure, and then providing an interpretation for the concepts these neurons represent. We evaluate our approach by training convolutional neural networks on Pubfig, ImageNet, and Diabetic Retinopathy datasets. 
Our evaluation demonstrates that influence-directed explanations (1) localize features used by the network, (2) isolate features distinguishing related instances, (3) help extract the essence of what the network learned about the class, and (4) assist in debugging misclassifications.\n", "keywords": "Deep neural networks;convolutional networks;influence measures;explanations", "primary_area": "", "supplementary_material": "", "author": "Anupam Datta;Matt Fredrikson;Klas Leino;Linyi Li;Shayak Sen", "authorids": "danupam@cmu.edu;mfredrik@cs.cmu.edu;kleino@cs.cmu.edu;ly-li14@mails.tsinghua.edu.cn;shayaks@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndatta2018influencedirected,\ntitle={Influence-Directed Explanations for Deep Convolutional Networks},\nauthor={Anupam Datta and Matt Fredrikson and Klas Leino and Linyi Li and Shayak Sen},\nyear={2018},\nurl={https://openreview.net/forum?id=SJPpHzW0-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJPpHzW0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJPpHzW0-", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5504720312490252443&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Quantitatively Evaluating GANs With Divergences Proposed for Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/41", "id": "SJQHjzZ0-", "author_site": "Daniel Im, He Ma, Graham W Taylor, Kristin Branson", "tldr": "An empirical evaluation on generative adversarial networks", "abstract": "Generative adversarial networks (GANs) have been extremely effective in approximating complex distributions of high-dimensional, input data samples, and substantial progress has been made in understanding and improving GAN performance in terms of both theory and application. \nHowever, we currently lack quantitative methods for model assessment. Because of this, while many GAN variants being proposed, we have relatively little understanding of their relative abilities. In this paper, we evaluate the performance of various types of GANs using divergence and distance functions typically used only for training. We observe consistency across the various proposed metrics and, interestingly, the test-time metrics do not favour networks that use the same training-time criterion. We also compare the proposed metrics to human perceptual scores.", "keywords": "Generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Daniel Jiwoong Im;He Ma;Graham W. 
Taylor;Kristin Branson", "authorids": "daniel.im@aifounded.com;hma02@uoguelph.ca;gwtaylor@uoguelph.ca;kristinbranson@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njiwoong2018quantitatively,\ntitle={Quantitatively Evaluating {GAN}s With Divergences Proposed for Training},\nauthor={Daniel Jiwoong Im and Alllan He Ma and Graham W. Taylor and Kristin Branson},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJQHjzZ0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;7;7", "confidence": "3;5;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4962866391557443877&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=SJQHjzZ0-", "pdf": "https://openreview.net/pdf?id=SJQHjzZ0-", "email": ";;;", "author_num": 4 }, { "id": "SJQO7UJCW", "title": "Adversarial Learning for Semi-Supervised Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a method for semi-supervised semantic segmentation using the adversarial network. While most existing discriminators are trained to classify input images as real or fake on the image level, we design a discriminator in a fully convolutional manner to differentiate the predicted probability maps from the ground truth segmentation distribution with the consideration of the spatial resolution. We show that the proposed discriminator can be used to improve the performance on semantic segmentation by coupling the adversarial loss with the standard cross entropy loss on the segmentation network. In addition, the fully convolutional discriminator enables the semi-supervised learning through discovering the trustworthy regions in prediction results of unlabeled images, providing additional supervisory signals. In contrast to existing methods that utilize weakly-labeled images, our method leverages unlabeled images without any annotation to enhance the segmentation model. 
Experimental results on both the PASCAL VOC 2012 dataset and the Cityscapes dataset demonstrate the effectiveness of our algorithm.", "keywords": "semantic segmentation;adversarial learning;semi-supervised learning;self-taught learning", "primary_area": "", "supplementary_material": "", "author": "Wei-Chih Hung;Yi-Hsuan Tsai;Yan-Ting Liou;Yen-Yu Lin;Ming-Hsuan Yang", "authorids": "whung8@ucmerced.edu;ytsai@nec-labs.com;lyt@csie.ntu.edu.tw;yylin@citi.sinica.edu.tw;mhyang@ucmerced.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhung2018adversarial,\ntitle={Adversarial Learning for Semi-Supervised Semantic Segmentation},\nauthor={Wei-Chih Hung and Yi-Hsuan Tsai and Yan-Ting Liou and Yen-Yu Lin and Ming-Hsuan Yang},\nyear={2018},\nurl={https://openreview.net/forum?id=SJQO7UJCW},\n}", "github": "[![github](/images/github_icon.svg) hfslyc/AdvSemiSeg](https://github.com/hfslyc/AdvSemiSeg) + [![Papers with Code](/images/pwc_icon.svg) 12 community implementations](https://paperswithcode.com/paper/?openreview=SJQO7UJCW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJQO7UJCW", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 761, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5189989796829982628&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "SJSVuReCZ", "title": "SHADE: SHAnnon DEcay Information-Based Regularization for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Regularization is a big issue for training deep neural networks. In this paper, we propose a new information-theory-based regularization scheme named SHADE for SHAnnon DEcay. The originality of the approach is to define a prior based on conditional entropy, which explicitly decouples the learning of invariant representations in the regularizer and the learning of correlations between inputs and labels in the data fitting term. We explain why this quantity makes our model able to achieve invariance with respect to input variations. 
We empirically validate the efficiency of our approach to improve classification performances compared to standard regularization schemes on several standard architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Michael Blot;Thomas Robert;Nicolas Thome;Matthieu Cord", "authorids": "michael.blot@lip6.fr;thomas.robert@lip6.fr;nicolas.thome@lip6.fr;matthieu.cord@lip6.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nblot2018shade,\ntitle={{SHADE}: {SHA}nnon {DE}cay Information-Based Regularization for Deep Learning},\nauthor={Michael Blot and Thomas Robert and Nicolas Thome and Matthieu Cord},\nyear={2018},\nurl={https://openreview.net/forum?id=SJSVuReCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJSVuReCZ", "pdf_size": 0, "rating": "4;4;5;7", "confidence": "3;3;4;3", "rating_avg": 5.0, "confidence_avg": 3.25, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2298302489088552172&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SJTB5GZCb", "title": "Extending the Framework of Equilibrium Propagation to General Dynamics", "track": "main", "status": "Workshop", "tldr": "We describe a biologically plausible learning algorithm for fixed point recurrent networks without tied weights", "abstract": "The biological plausibility of the backpropagation algorithm has long been doubted by neuroscientists. Two major reasons are that neurons would need to send two different types of signal in the forward and backward phases, and that pairs of neurons would need to communicate through symmetric bidirectional connections.\nWe present a simple two-phase learning procedure for fixed point recurrent networks that addresses both these issues.\nIn our model, neurons perform leaky integration and synaptic weights are updated through a local mechanism.\nOur learning method extends the framework of Equilibrium Propagation to general dynamics, relaxing the requirement of an energy function.\nAs a consequence of this generalization, the algorithm does not compute the true gradient of the objective function,\nbut rather approximates it at a precision which is proven to be directly related to the degree of symmetry of the feedforward and feedback weights.\nWe show experimentally that the intrinsic properties of the system lead to alignment of the feedforward and feedback weights, and that our algorithm optimizes the objective function.", "keywords": "Deep Learning;Backpropagation;Fixed Point Recurrent Neural Network;Biologically Plausible Learning;Feedback Alignment;Dynamical System;Gradient-Free Optimization", "primary_area": "", "supplementary_material": "", "author": "Benjamin Scellier;Anirudh Goyal;Jonathan Binas;Thomas Mesnard;Yoshua Bengio", "authorids": "benjamin.scellier@polytechnique.edu;anirudhgoyal9119@gmail.com;jbinas@gmail.com;thomas.mesnard@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nscellier2018extending,\ntitle={Extending the Framework of Equilibrium Propagation to General 
Dynamics},\nauthor={Benjamin Scellier and Anirudh Goyal and Jonathan Binas and Thomas Mesnard and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=SJTB5GZCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SJTB5GZCb", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.944911182523068, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15673369876435147928&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SJUX_MWCZ", "title": "Predict Responsibly: Increasing Fairness by Learning to Defer", "track": "main", "status": "Workshop", "tldr": "Incorporating the ability to say I-don't-know can improve the fairness of a classifier without sacrificing too much accuracy, and this improvement magnifies when the classifier has insight into downstream decision-making.", "abstract": "When machine learning models are used for high-stakes decisions, they should predict accurately, fairly, and responsibly. To fulfill these three requirements, a model must be able to output a reject option (i.e. say \"I Don't Know\") when it is not qualified to make a prediction. In this work, we propose learning to defer, a method by which a model can defer judgment to a downstream decision-maker such as a human user. We show that learning to defer generalizes the rejection learning framework in two ways: by considering the effect of other agents in the decision-making process, and by allowing for optimization of complex objectives. We propose a learning algorithm which accounts for potential biases held by decision-makers later in a pipeline. Experiments on real-world datasets demonstrate that learning\nto defer can make a model not only more accurate but also less biased. Even when\noperated by highly biased users, we show that\ndeferring models can still greatly improve the fairness of the entire pipeline.", "keywords": "Fairness;IDK;Calibration;Automated decision-making;Transparency;Accountability", "primary_area": "", "supplementary_material": "", "author": "David Madras;Toniann Pitassi;Richard Zemel", "authorids": "david.madras@mail.utoronto.ca;zemel@cs.toronto.edu;toni@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmadras2018predict,\ntitle={Predict Responsibly: Increasing Fairness by Learning to Defer},\nauthor={David Madras and Toniann Pitassi and Richard Zemel},\nyear={2018},\nurl={https://openreview.net/forum?id=SJUX_MWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJUX_MWCZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1751996927908217686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJVHY9lCb", "title": "Learning to Select: Problem, Solution, and Applications", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a \"Learning to Select\" problem that selects the best among a flexible-size set of candidates. 
This makes decisions based not only on the properties of the candidates, but also on the environment to which they belong. For example, job dispatching in a manufacturing factory is a typical \"Learning to Select\" problem. We propose a Variable-Length CNN, which combines the classification power of hidden features from a CNN with the idea of flexible input from Learning to Rank algorithms. This not only handles flexible candidates using a Dynamic Computation Graph, but is also computationally efficient because it only builds a network with the necessary sizes to fit the situation. We applied the algorithm to the job dispatching problem, which uses dispatching log data obtained from a virtual fine-tuned factory. Our proposed algorithm shows considerably better performance than other comparable algorithms.", "keywords": "Selection Problem;Job Dispatching;Convolution Neural Network", "primary_area": "", "supplementary_material": "", "author": "Heechang Ryu;Donghyun Kim;Hayong Shin", "authorids": "rhc93@kaist.ac.kr;dhk618@kaist.ac.kr;hyshin@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nryu2018learning,\ntitle={Learning to Select: Problem, Solution, and Applications},\nauthor={Heechang Ryu and Donghyun Kim and Hayong Shin},\nyear={2018},\nurl={https://openreview.net/forum?id=SJVHY9lCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJVHY9lCb", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:opO4YsDq9owJ:scholar.google.com/&scioq=Learning+to+Select:+Problem,+Solution,+and+Applications&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJZ2Mf-0-", "title": "Adaptive Memory Networks", "track": "main", "status": "Workshop", "tldr": "Memory networks with faster inference", "abstract": "Real-world Question Answering (QA) tasks consist of thousands of words that often represent many facts and entities. Existing models based on LSTMs require a large number of parameters to support external memory and do not generalize well for long sequence inputs. Memory networks attempt to address these limitations by storing information in an external memory module but must examine all inputs in the memory. Hence, for longer sequence inputs, the intermediate memory components proportionally scale in size, resulting in poor inference times and high computation costs.\n\nIn this paper, we present Adaptive Memory Networks (AMN) that process input question pairs to dynamically construct a network architecture optimized for lower inference times. During inference, AMN parses input text into entities within different memory slots. However, distinct from previous approaches, AMN is a dynamic network architecture that creates variable numbers of memory banks weighted by question relevance. Thus, the decoder can select a variable number of memory banks to construct an answer using fewer banks, creating a runtime trade-off between accuracy and speed. 
\n\nAMN is enabled by first, a novel bank controller that makes discrete decisions with high accuracy and second, the capabilities of a dynamic framework (such as PyTorch) that allow for dynamic network sizing and efficient variable mini-batching. In our results, we demonstrate that our model learns to construct a varying number of memory banks based on task complexity and achieves faster inference times for standard bAbI tasks, and modified bAbI tasks. We achieve state of the art accuracy over these tasks with an average 48% lower entities are examined during inference.", "keywords": "Memory Networks;Dynamic Networks;Faster Inference;Reasoning;QA", "primary_area": "", "supplementary_material": "", "author": "Daniel Li;Asim Kadav", "authorids": "li.daniel@berkeley.edu;asim@nec-labs.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2018adaptive,\ntitle={Adaptive Memory Networks},\nauthor={Daniel Li and Asim Kadav},\nyear={2018},\nurl={https://openreview.net/forum?id=SJZ2Mf-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=SJZ2Mf-0-", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.9819805060619659, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJZsR7kCZ", "title": "Iterative Deep Compression : Compressing Deep Networks for Classification and Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning and in particular deep learning approaches have outperformed many traditional techniques in accomplishing complex tasks such as\nimage classfication, natural language processing or speech recognition. Most of the state-of-the art deep networks have complex architecture and use a vast number of parameters to reach this superior performance. Though these networks use a large number of learnable parameters, those parameters present significant redundancy. Therefore, it is possible to compress the network without much affecting its accuracy by eliminating those redundant and unimportant parameters.\nIn this work, we propose a three stage compression pipeline, which consists of pruning, weight sharing and quantization to compress deep neural networks.\nOur novel pruning technique combines magnitude based ones with dense sparse dense ideas and iteratively finds for each layer its achievable sparsity instead of selecting a single threshold for the whole network.\nUnlike previous works, where compression is only applied on networks performing classification, we evaluate and perform compression on networks for classification as well as semantic segmentation, which is greatly useful for understanding scenes in autonomous driving.\nWe tested our method on LeNet-5 and FCNs, performing classification and semantic segmentation, respectively. With LeNet-5 on MNIST, pruning reduces the number of parameters by 15.3 times and storage requirement from 1.7 MB to 0.006 MB with accuracy loss of 0.03%. 
With FCN8 on Cityscapes, we decrease the number of parameters by 8 times and reduce the storage requirement from 537.47 MB to 18.23 MB with class-wise intersection-over-union (IoU) loss of 4.93% on the validation data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sugandha Doda;Vitor Fortes Rey;Dr. Nadereh Hatami;Prof. Dr. Paul Lukowicz", "authorids": "sugandhadoda672@gmail.com;vitor.fortes@dfki.uni-kl.de;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndoda2018iterative,\ntitle={Iterative Deep Compression : Compressing Deep Networks for Classification and Semantic Segmentation},\nauthor={Sugandha Doda and Vitor Fortes Rey and Dr. Nadereh Hatami and Prof. Dr. Paul Lukowicz},\nyear={2018},\nurl={https://openreview.net/forum?id=SJZsR7kCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJZsR7kCZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6203029788080797996&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJa1Nk10b", "title": "Anytime Neural Network: a Versatile Trade-off Between Computation and Accuracy", "track": "main", "status": "Reject", "tldr": "By focusing more on the final predictions in anytime predictors (such as the very recent Multi-Scale-DenseNets), we make small anytime models to outperform large ones that don't have such focus. ", "abstract": "We present an approach for anytime predictions in deep neural networks (DNNs). For each test sample, an anytime predictor produces a coarse result quickly, and then continues to refine it until the test-time computational budget is depleted. Such predictors can address the growing computational problem of DNNs by automatically adjusting to varying test-time budgets. In this work, we study a \\emph{general} augmentation to feed-forward networks to form anytime neural networks (ANNs) via auxiliary predictions and losses. Specifically, we point out a blind-spot in recent studies in such ANNs: the importance of high final accuracy. In fact, we show on multiple recognition data-sets and architectures that by having near-optimal final predictions in small anytime models, we can effectively double the speed of large ones to reach corresponding accuracy level. We achieve such speed-up with simple weighting of anytime losses that oscillate during training. We also assemble a sequence of exponentially deepening ANNs, to achieve both theoretically and practically near-optimal anytime results at any budget, at the cost of a constant fraction of additional consumed budget.", "keywords": "anytime;neural network;adaptive prediction;budgeted prediction", "primary_area": "", "supplementary_material": "", "author": "Hanzhang Hu;Debadeepta Dey;Martial Hebert;J. 
Andrew Bagnell", "authorids": "hanzhang@cs.cmu.edu;dedey@microsoft.com;hebert@ri.cmu.edu;dbagnell@ri.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhu2018anytime,\ntitle={Anytime Neural Network: a Versatile Trade-off Between Computation and Accuracy},\nauthor={Hanzhang Hu and Debadeepta Dey and Martial Hebert and J. Andrew Bagnell},\nyear={2018},\nurl={https://openreview.net/forum?id=SJa1Nk10b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJa1Nk10b", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;2", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3783990122365916683&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Residual Connections Encourage Iterative Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/267", "id": "SJa9iHgAZ", "author_site": "Stanislaw Jastrzebski, Devansh Arpit, Nicolas Ballas, Vikas Verma, Tong Che, Yoshua Bengio", "tldr": "Residual connections really perform iterative inference", "abstract": "Residual networks (Resnets) have become a prominent architecture in deep learning. However, a comprehensive understanding of Resnets is still a topic of ongoing research. A recent view argues that Resnets perform iterative refinement of features. We attempt to further expose properties of this aspect. To this end, we study Resnets both analytically and empirically. We formalize the notion of iterative refinement in Resnets by showing that residual architectures naturally encourage features to move along the negative gradient of loss during the feedforward phase. In addition, our empirical analysis suggests that Resnets are able to perform both representation learning and iterative refinement. In general, a Resnet block tends to concentrate representation learning behavior in the first few layers while higher layers perform iterative refinement of features. 
Finally we observe that sharing residual layers naively leads to representation explosion and hurts generalization performance, and show that simple existing strategies can help alleviating this problem.", "keywords": "residual network;iterative inference;deep learning", "primary_area": "", "supplementary_material": "", "author": "Stanis\u0142aw Jastrzebski;Devansh Arpit;Nicolas Ballas;Vikas Verma;Tong Che;Yoshua Bengio", "authorids": "staszek.jastrzebski@gmail.com;devansharpit@gmail.com;ballas.n@gmail.com;vikasverma.iitm@gmail.com;tongcheprivate@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njastrzebski2018residual,\ntitle={Residual Connections Encourage Iterative Inference},\nauthor={Stanis\u0142aw Jastrzebski and Devansh Arpit and Nicolas Ballas and Vikas Verma and Tong Che and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJa9iHgAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": -0.5, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14453729681594903284&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SJa9iHgAZ", "pdf": "https://openreview.net/pdf?id=SJa9iHgAZ", "email": ";;;;;", "author_num": 6 }, { "title": "Deep Learning with Logged Bandit Feedback", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/282", "id": "SJaP_-xAb", "author_site": "Thorsten Joachims, Adith Swaminathan, Maarten de Rijke", "tldr": "The paper proposes a new output layer for deep networks that permits the use of logged contextual bandit feedback for training. ", "abstract": "We propose a new output layer for deep neural networks that permits the use of logged contextual bandit feedback for training. Such contextual bandit feedback can be available in huge quantities (e.g., logs of search engines, recommender systems) at little cost, opening up a path for training deep networks on orders of magnitude more data. To this effect, we propose a Counterfactual Risk Minimization (CRM) approach for training deep networks using an equivariant empirical risk estimator with variance regularization, BanditNet, and show how the resulting objective can be decomposed in a way that allows Stochastic Gradient Descent (SGD) training. We empirically demonstrate the effectiveness of the method by showing how deep networks -- ResNets in particular -- can be trained for object recognition without conventionally labeled images. 
", "keywords": "Batch Learning from Bandit Feedback;Counterfactual Learning", "primary_area": "", "supplementary_material": "", "author": "Thorsten Joachims;Adith Swaminathan;Maarten de Rijke", "authorids": "tj@cs.cornell.edu;adswamin@microsoft.com;derijke@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njoachims2018deep,\ntitle={Deep Learning with Logged Bandit Feedback},\nauthor={Thorsten Joachims and Adith Swaminathan and Maarten de Rijke},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJaP_-xAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8863477731568200162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJaP_-xAb", "pdf": "https://openreview.net/pdf?id=SJaP_-xAb", "email": ";;", "author_num": 3 }, { "id": "SJahqJZAW", "title": "Stabilizing GAN Training with Multiple Random Projections", "track": "main", "status": "Reject", "tldr": "Stable GAN training in high dimensions by using an array of discriminators, each with a low dimensional view of generated samples", "abstract": "Training generative adversarial networks is unstable in high-dimensions as the true data distribution tends to be concentrated in a small fraction of the ambient space. The discriminator is then quickly able to classify nearly all generated samples as fake, leaving the generator without meaningful gradients and causing it to deteriorate after a point in training. In this work, we propose training a single generator simultaneously against an array of discriminators, each of which looks at a different random low-dimensional projection of the data. Individual discriminators, now provided with restricted views of the input, are unable to reject generated samples perfectly and continue to provide meaningful gradients to the generator throughout training. Meanwhile, the generator learns to produce samples consistent with the full data distribution to satisfy all discriminators simultaneously. 
We demonstrate the practical utility of this approach experimentally, and show that it is able to produce image samples with higher quality than traditional training with a single discriminator.", "keywords": "generative adversarial networks;stable training;low-dimensional projections;deep learning", "primary_area": "", "supplementary_material": "", "author": "Behnam Neyshabur;Srinadh Bhojanapalli;Ayan Chakrabarti", "authorids": "bneyshabur@ttic.edu;srinadh@ttic.edu;ayan@wustl.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nneyshabur2018stabilizing,\ntitle={Stabilizing {GAN} Training with Multiple Random Projections},\nauthor={Behnam Neyshabur and Srinadh Bhojanapalli and Ayan Chakrabarti},\nyear={2018},\nurl={https://openreview.net/forum?id=SJahqJZAW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJahqJZAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJahqJZAW", "pdf_size": 0, "rating": "3;5;8", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8029550685469661, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4288886064123466035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Can recurrent neural networks warp time?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/181", "id": "SJcKhk-Ab", "author_site": "Corentin Tallec, Yann Ollivier", "tldr": "Proves that gating mechanisms provide invariance to time transformations. Introduces and tests a new initialization for LSTMs from this insight.", "abstract": "Successful recurrent models such as long short-term memories (LSTMs) and gated recurrent units (GRUs) use \\emph{ad hoc} gating mechanisms. Empirically these models have been found to improve the learning of medium to long term temporal dependencies and to help with vanishing gradient issues.\n\t\nWe prove that learnable gates in a recurrent model formally provide \\emph{quasi-invariance to general time transformations} in the input data. We recover part of the LSTM architecture from a simple axiomatic approach.\n\t\nThis result leads to a new way of initializing gate biases in LSTMs and GRUs. 
Experimentally, this new \\emph{chrono initialization} is shown to greatly improve learning of long term dependencies, with minimal implementation effort.\n\n", "keywords": "RNN", "primary_area": "", "supplementary_material": "", "author": "Corentin Tallec;Yann Ollivier", "authorids": "corentin.tallec@polytechnique.edu;yol@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntallec2018can,\ntitle={Can recurrent neural networks warp time?},\nauthor={Corentin Tallec and Yann Ollivier},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJcKhk-Ab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;4;4", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17268685300536863187&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SJcKhk-Ab", "pdf": "https://openreview.net/pdf?id=SJcKhk-Ab", "email": ";", "author_num": 2 }, { "id": "SJd0EAy0b", "title": "Generalized Graph Embedding Models", "track": "main", "status": "Reject", "tldr": "Generalized Graph Embedding Models", "abstract": "Many types of relations in physical, biological, social and information systems can be modeled as homogeneous or heterogeneous concept graphs. Hence, learning from and with graph embeddings has drawn a great deal of research interest recently, but only ad hoc solutions have been obtained this far. In this paper, we conjecture that the one-shot supervised learning mechanism is a bottleneck in improving the performance of the graph embedding learning algorithms, and propose to extend this by introducing a multi-shot unsupervised learning framework. 
Empirical results on several real-world data set show that the proposed model consistently and significantly outperforms existing state-of-the-art approaches on knowledge base completion and graph based multi-label classification tasks.", "keywords": "representation learning;knowledge graphs;relational inference;link prediction;multi-label classification;knowledge base completion", "primary_area": "", "supplementary_material": "", "author": "Qiao Liu;Xiaohui Yang;Rui Wan;Shouzhong Tu;Zufeng Wu", "authorids": "qliu@uestc.edu.cn;yangxhui@uestc.std.edu.cn;rwan@uestc.std.edu.cn;tusz11@mails.tsinghua.edu.cn;wuzufeng@uestc.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nliu2018generalized,\ntitle={Generalized Graph Embedding Models},\nauthor={Qiao Liu and Xiaohui Yang and Rui Wan and Shouzhong Tu and Zufeng Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=SJd0EAy0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJd0EAy0b", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJdCUMZAW", "title": "Data-efficient Deep Reinforcement Learning for Dexterous Manipulation", "track": "main", "status": "Reject", "tldr": "Data-efficient deep reinforcement learning can be used to learning precise stacking policies.", "abstract": "Grasping an object and precisely stacking it on another is a difficult task for traditional robotic control or hand-engineered approaches. Here we examine the problem in simulation and provide techniques aimed at solving it via deep reinforcement learning. We introduce two straightforward extensions to the Deep Deterministic Policy Gradient algorithm (DDPG), which make it significantly more data-efficient and scalable. Our results show that by making extensive use of off-policy data and replay, it is possible to find high-performance control policies. Further, our results hint that it may soon be feasible to train successful stacking policies by collecting interactions on real robots.", "keywords": "Reinforcement learning;robotics;dexterous manipulation;off-policy learning", "primary_area": "", "supplementary_material": "", "author": "Ivo Popov;Nicolas Heess;Timothy P. Lillicrap;Roland Hafner;Gabriel Barth-Maron;Matej Vecerik;Thomas Lampe;Tom Erez;Yuval Tassa;Martin Riedmiller", "authorids": "ivaylo.popov@hotmail.com;heess@google.com;countzero@google.com;rhafner@google.com;gabrielbm@google.com;matejvecerik@google.com;thomaslampe@google.com;etom@google.com;tassa@google.com;riedmiller@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\npopov2018dataefficient,\ntitle={Data-efficient Deep Reinforcement Learning for Dexterous Manipulation},\nauthor={Ivo Popov and Nicolas Heess and Timothy P. 
Lillicrap and Roland Hafner and Gabriel Barth-Maron and Matej Vecerik and Thomas Lampe and Tom Erez and Yuval Tassa and Martin Riedmiller},\nyear={2018},\nurl={https://openreview.net/forum?id=SJdCUMZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJdCUMZAW", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 10, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 348, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1143854187206916398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SJgf6Z-0W", "title": "Predicting Multiple Actions for Stochastic Continuous Control", "track": "main", "status": "Reject", "tldr": "We introduce a novel reinforcement learning algorithm, that predicts multiple actions and samples from them.", "abstract": "We introduce a new approach to estimate continuous actions using actor-critic algorithms for reinforcement learning problems. Policy gradient methods usually predict one continuous action estimate or parameters of a presumed distribution (most commonly Gaussian) for any given state which might not be optimal as it may not capture the complete description of the target distribution. Our approach instead predicts M actions with the policy network (actor) and then uniformly sample one action during training as well as testing at each state. This allows the agent to learn a simple stochastic policy that has an easy to compute expected return. In all experiments, this facilitates better exploration of the state space during training and converges to a better policy. ", "keywords": "Reinforcement Learning;DDPG;Multiple Action Prediction", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Kumar;Christian Rupprecht;Federico Tombari;Gregory D. Hager", "authorids": "sanjeev.kumar@in.tum.de;christian.rupprecht@in.tum.de;tombari@in.tum.de;hager@cs.tum.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkumar2018predicting,\ntitle={Predicting Multiple Actions for Stochastic Continuous Control},\nauthor={Sanjeev Kumar and Christian Rupprecht and Federico Tombari and Gregory D. 
Hager},\nyear={2018},\nurl={https://openreview.net/forum?id=SJgf6Z-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJgf6Z-0W", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.9707253433941508, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16407644121311819480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Gradient Estimators for Implicit Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/252", "id": "SJi9WOeRb", "author_site": "Yingzhen Li, Richard E Turner", "tldr": "We introduced a novel gradient estimator using Stein's method, and compared with other methods on learning implicit models for approximate inference and image generation.", "abstract": "Implicit models, which allow for the generation of samples but not for point-wise evaluation of probabilities, are omnipresent in real-world problems tackled by machine learning and a hot topic of current research. Some examples include data simulators that are widely used in engineering and scientific research, generative adversarial networks (GANs) for image synthesis, and hot-off-the-press approximate inference techniques relying on implicit distributions. The majority of existing approaches to learning implicit models rely on approximating the intractable distribution or optimisation objective for gradient-based optimisation, which is liable to produce inaccurate updates and thus poor models. This paper alleviates the need for such approximations by proposing the \\emph{Stein gradient estimator}, which directly estimates the score function of the implicitly defined distribution. The efficacy of the proposed estimator is empirically demonstrated by examples that include meta-learning for approximate inference and entropy regularised GANs that provide improved sample diversity.", "keywords": "Implicit Models;Approximate Inference;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Yingzhen Li;Richard E. Turner", "authorids": "yl494@cam.ac.uk;ret26@cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nli2018gradient,\ntitle={Gradient Estimators for Implicit Models},\nauthor={Yingzhen Li and Richard E. 
Turner},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJi9WOeRb},\n}", "github": "[![github](/images/github_icon.svg) YingzhenLi/SteinGrad](https://github.com/YingzhenLi/SteinGrad)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;4;2", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=29993418784277680&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJi9WOeRb", "pdf": "https://openreview.net/pdf?id=SJi9WOeRb", "email": ";", "author_num": 2 }, { "id": "SJiHOSeR-", "title": "Contextual memory bandit for pro-active dialog engagement", "track": "main", "status": "Reject", "tldr": "", "abstract": "An objective of pro-activity in dialog systems is to enhance the usability of conversational\nagents by enabling them to initiate conversation on their own. While\ndialog systems have become increasingly popular during the last couple of years,\ncurrent task oriented dialog systems are still mainly reactive and users tend to\ninitiate conversations. In this paper, we propose to introduce the paradigm of contextual\nbandits as framework for pro-active dialog systems. Contextual bandits\nhave been the model of choice for the problem of reward maximization with partial\nfeedback since they fit well to the task description. As a second contribution,\nwe introduce and explore the notion of memory into this paradigm. We propose\ntwo differentiable memory models that act as parts of the parametric reward estimation\nfunction. The first one, Convolutional Selective Memory Networks, uses\na selection of past interactions as part of the decision support. The second model,\ncalled Contextual Attentive Memory Network, implements a differentiable attention\nmechanism over the past interactions of the agent. The goal is to generalize\nthe classic model of contextual bandits to settings where temporal information\nneeds to be incorporated and leveraged in a learnable manner. 
Finally, we illustrate\nthe usability and performance of our model for building a pro-active mobile\nassistant through an extensive set of experiments.", "keywords": "contextual bandit;memory network;proactive dialog engagement", "primary_area": "", "supplementary_material": "", "author": "julien perez;Tomi Silander", "authorids": "julien.perez@naverlabs.com;julien.perez@naverlabs.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nperez2018contextual,\ntitle={Contextual memory bandit for pro-active dialog engagement},\nauthor={julien perez and Tomi Silander},\nyear={2018},\nurl={https://openreview.net/forum?id=SJiHOSeR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJiHOSeR-", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8649009603984149443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Diffusion Convolutional Recurrent Neural Network: Data-Driven Traffic Forecasting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/80", "id": "SJiHXGWAZ", "author_site": "Yaguang Li, Rose Yu, Cyrus Shahabi, Yan Liu", "tldr": "A neural sequence model that learns to forecast on a directed graph.", "abstract": "Spatiotemporal forecasting has various applications in neuroscience, climate and transportation domain. Traffic forecasting is one canonical example of such learning task. The task is challenging due to (1) complex spatial dependency on road networks, (2) non-linear temporal dynamics with changing road conditions and (3) inherent difficulty of long-term forecasting. To address these challenges, we propose to model the traffic flow as a diffusion process on a directed graph and introduce Diffusion Convolutional Recurrent Neural Network (DCRNN), a deep learning framework for traffic forecasting that incorporates both spatial and temporal dependency in the traffic flow. Specifically, DCRNN captures the spatial dependency using bidirectional random walks on the graph, and the temporal dependency using the encoder-decoder architecture with scheduled sampling. 
We evaluate the framework on two real-world large-scale road network traffic datasets and observe consistent improvement of 12% - 15% over state-of-the-art baselines", "keywords": "Traffic prediction;spatiotemporal forecasting;diffusion;graph convolution;random walk;long-term forecasting", "primary_area": "", "supplementary_material": "", "author": "Yaguang Li;Rose Yu;Cyrus Shahabi;Yan Liu", "authorids": "yaguang@usc.edu;rose@caltech.edu;shahabi@usc.edu;yanliu.cs@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nli2018diffusion,\ntitle={Diffusion Convolutional Recurrent Neural Network: Data-Driven Traffic Forecasting},\nauthor={Yaguang Li and Rose Yu and Cyrus Shahabi and Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJiHXGWAZ},\n}", "github": "[![github](/images/github_icon.svg) liyaguang/DCRNN](https://github.com/liyaguang/DCRNN) + [![Papers with Code](/images/pwc_icon.svg) 15 community implementations](https://paperswithcode.com/paper/?openreview=SJiHXGWAZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;5;9", "confidence": "5;3;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.3273268353539885, "gs_citation": 4697, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6301301566407555232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=SJiHXGWAZ", "pdf": "https://openreview.net/pdf?id=SJiHXGWAZ", "email": ";;;", "author_num": 4 }, { "id": "SJjADecmf", "title": "Detecting Anomalies in Communication Packet Streams based on Generative Adversarial Networks", "track": "main", "status": "Withdraw", "tldr": "semi-supervised and transfer learning on packet flow classification, via a system of cooperative or adversarial neural blocks", "abstract": "The fault diagnosis in a modern communication system is traditionally supposed to be difficult, or even impractical for a purely data-driven machine learning approach, for it is a humanmade system of intensive knowledge. A few labeled raw packet streams extracted from fault archive can hardly be sufficient to deduce the intricate logic of underlying protocols. In this paper, we supplement these limited samples with two inexhaustible data sources: the unlabeled records probed from a system in service, and the labeled data simulated in an emulation environment. To transfer their inherent knowledge to the target domain, we construct a directed information flow graph, whose nodes are neural network components consisting of two generators, three discriminators and one classifier, and whose every forward path represents a pair of adversarial optimization goals, in accord with the semi-supervised and transfer learning demands. The multi-headed network can be trained in an alternative approach, at each iteration of which we select one target to update the weights along the path upstream, and refresh the residual layer-wisely to all outputs downstream. The actual results show that it can achieve comparable accuracy on classifying Transmission Control Protocol (TCP) streams without deliberate expert features. 
The solution has relieved operation engineers from massive works of understanding and maintaining rules, and provided a quick solution independent of specific protocols.", "keywords": "Anomaly Detection;Fault diagnosis;Generative Adversarial Networks;Network Operation;TCP/IP", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper280/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018detecting,\n title={Detecting Anomalies in Communication Packet Streams based on Generative Adversarial Networks},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=rkKsaLlR-}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SJjADecmf", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 1, "corr_rating_confidence": -0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14863535241656773176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJky6Ry0W", "title": "Learning Independent Causal Mechanisms", "track": "main", "status": "Reject", "tldr": "", "abstract": "Independent causal mechanisms are a central concept in the study of causality\nwith implications for machine learning tasks. In this work we develop\nan algorithm to recover a set of (inverse) independent mechanisms relating\na distribution transformed by the mechanisms to a reference distribution.\nThe approach is fully unsupervised and based on a set of experts that compete\nfor data to specialize and extract the mechanisms. We test and analyze\nthe proposed method on a series of experiments based on image transformations.\nEach expert successfully maps a subset of the transformed data\nto the original domain, and the learned mechanisms generalize to other\ndomains. 
We discuss implications for domain transfer and links to recent\ntrends in generative modeling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giambattista Parascandolo;Mateo Rojas Carulla;Niki Kilbertus;Bernhard Schoelkopf", "authorids": "gparascandolo@tue.mpg.de;mrojascarulla@gmail.com;nkilbertus@tue.mpg.de;bs@tue.mpg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nparascandolo2018learning,\ntitle={Learning Independent Causal Mechanisms},\nauthor={Giambattista Parascandolo and Mateo Rojas Carulla and Niki Kilbertus and Bernhard Schoelkopf},\nyear={2018},\nurl={https://openreview.net/forum?id=SJky6Ry0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJky6Ry0W", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 204, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1033682372481921023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SJlhPMWAW", "title": "GraphVAE: Towards Generation of Small Graphs Using Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "We demonstate an autoencoder for graphs.", "abstract": "Deep learning on graphs has become a popular research topic with many applications. However, past work has concentrated on learning graph embedding tasks only, which is in contrast with advances in generative models for images and text. Is it possible to transfer this progress to the domain of graphs? We propose to sidestep hurdles associated with linearization of such discrete structures by having a decoder output a probabilistic fully-connected graph of a predefined maximum size directly at once. Our method is formulated as a variational autoencoder. We evaluate on the challenging task of conditional molecule generation. 
", "keywords": "graph;generative model;autoencoder", "primary_area": "", "supplementary_material": "", "author": "Martin Simonovsky;Nikos Komodakis", "authorids": "simonovm@imagine.enpc.fr;nikos.komodakis@enpc.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsimonovsky2018graphvae,\ntitle={Graph{VAE}: Towards Generation of Small Graphs Using Variational Autoencoders},\nauthor={Martin Simonovsky and Nikos Komodakis},\nyear={2018},\nurl={https://openreview.net/forum?id=SJlhPMWAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJlhPMWAW", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;2", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9031357146703402647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "SJmAXkgCb", "title": "DNN Feature Map Compression using Learned Representation over GF(2)", "track": "main", "status": "Reject", "tldr": "Feature map compression method that converts quantized activations into binary vectors followed by nonlinear dimensionality reduction layers embedded into a DNN", "abstract": "In this paper, we introduce a method to compress intermediate feature maps of deep neural networks (DNNs) to decrease memory storage and bandwidth requirements during inference. Unlike previous works, the proposed method is based on converting fixed-point activations into vectors over the smallest GF(2) finite field followed by nonlinear dimensionality reduction (NDR) layers embedded into a DNN. Such an end-to-end learned representation finds more compact feature maps by exploiting quantization redundancies within the fixed-point activations along the channel or spatial dimensions. We apply the proposed network architecture to the tasks of ImageNet classification and PASCAL VOC object detection. Compared to prior approaches, the conducted experiments show a factor of 2 decrease in memory requirements with minor degradation in accuracy while adding only bitwise computations.", "keywords": "feature map;representation;compression;quantization;finite-field", "primary_area": "", "supplementary_material": "", "author": "Denis A. Gudovskiy;Alec Hodgkinson;Luca Rigazio", "authorids": "denis.gudovskiy@us.panasonic.com;alec.hodgkinson@us.panasonic.com;luca.rigazio@us.panasonic.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\na.2018dnn,\ntitle={{DNN} Feature Map Compression using Learned Representation over {GF}(2)},\nauthor={Denis A. 
Gudovskiy and Alec Hodgkinson and Luca Rigazio},\nyear={2018},\nurl={https://openreview.net/forum?id=SJmAXkgCb},\n}", "github": "[![github](/images/github_icon.svg) gudovskiy/fmap_compression](https://github.com/gudovskiy/fmap_compression)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJmAXkgCb", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10100727653390426513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "SJme6-ZR-", "title": "A Deep Learning Approach for Survival Clustering without End-of-life Signals", "track": "main", "status": "Reject", "tldr": "The goal of survival clustering is to map subjects into clusters. Without end-of-life signals, this is a challenging task. To address this task we propose a new loss function by modifying the Kuiper statistics.", "abstract": "The goal of survival clustering is to map subjects (e.g., users in a social network, patients in a medical study) to $K$ clusters ranging from low-risk to high-risk. Existing survival methods assume the presence of clear \\textit{end-of-life} signals or introduce them artificially using a pre-defined timeout. In this paper, we forego this assumption and introduce a loss function that differentiates between the empirical lifetime distributions of the clusters using a modified Kuiper statistic. We learn a deep neural network by optimizing this loss, that performs a soft clustering of users into survival groups. We apply our method to a social network dataset with over 1M subjects, and show significant improvement in C-index compared to alternatives.", "keywords": "Survival Analysis;Kuiper statistics;model-free", "primary_area": "", "supplementary_material": "", "author": "S Chandra Mouli;Bruno Ribeiro;Jennifer Neville", "authorids": "chandr@purdue.edu;ribeiro@cs.purdue.edu;neville@cs.purdue.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchandra2018a,\ntitle={A Deep Learning Approach for Survival Clustering without End-of-life Signals},\nauthor={S Chandra Mouli and Bruno Ribeiro and Jennifer Neville},\nyear={2018},\nurl={https://openreview.net/forum?id=SJme6-ZR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJme6-ZR-", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;1;5", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.2773500981126144, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lbJ5DEVoiBwJ:scholar.google.com/&scioq=A+Deep+Learning+Approach+for+Survival+Clustering+without+End-of-life+Signals&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJn0sLgRb", "title": "Data Augmentation by Pairing Samples for Images Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is a widely used technique in many machine learning tasks, such as image classification, to virtually enlarge the training dataset size and avoid overfitting. 
Traditional data augmentation techniques for image classification tasks create new samples from the original training data by, for example, flipping, distorting, adding a small amount of noise to, or cropping a patch from an original image. In this paper, we introduce a simple but surprisingly effective data augmentation technique for image classification tasks. With our technique, named SamplePairing, we synthesize a new sample from one image by overlaying another image randomly chosen from the training data (i.e., taking an average of two images for each pixel). By using two images randomly selected from the training set, we can generate N^2 new samples from N training samples. This simple data augmentation technique significantly improved classification accuracy for all the tested datasets; for example, the top-1 error rate was reduced from 33.5% to 29.0% for the ILSVRC 2012 dataset with GoogLeNet and from 8.22% to 6.93% in the CIFAR-10 dataset. We also show that our SamplePairing technique largely improved accuracy when the number of samples in the training set was very small. Therefore, our technique is more valuable for tasks with a limited amount of training data, such as medical imaging tasks.\n", "keywords": "Data augmentation;Image classification", "primary_area": "", "supplementary_material": "", "author": "Hiroshi Inoue", "authorids": "inouehrs@jp.ibm.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ninoue2018data,\ntitle={Data Augmentation by Pairing Samples for Images Classification},\nauthor={Hiroshi Inoue},\nyear={2018},\nurl={https://openreview.net/forum?id=SJn0sLgRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJn0sLgRb", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 25, "authors#_avg": 1, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 700, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3545339074007715786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJsb_xTSM", "title": "Multitask learning of Multilingual Sentence Representations", "track": "main", "status": "Withdraw", "tldr": "We jointly train a multilingual skip-gram model and a cross-lingual sentence similarity model to learn high quality multilingual text embeddings that perform well in the low resource scenario.", "abstract": "We present a novel multi-task training approach to learning multilingual distributed representations of text. Our system learns word and sentence embeddings jointly by training a multilingual skip-gram model together with a cross-lingual sentence similarity model. We construct sentence embeddings by processing word embeddings with an LSTM and by taking an average of the outputs. Our architecture can transparently use both monolingual and sentence aligned bilingual corpora to learn multilingual embeddings, thus covering a vocabulary significantly larger than the vocabulary of the bilingual corpora alone. Our model shows competitive performance in a standard cross-lingual document classification task. 
We also show the effectiveness of our method in a low-resource scenario.", "keywords": "multilingual;embedding;representation learning;multi-task learning;low resource", "primary_area": "", "supplementary_material": "", "author": "Karan Singla;Dogan Can;Shrikanth Narayanan", "authorids": "singlak@usc.edu;dogancan@usc.edu;shri@ee.usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SJsb_xTSM", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 3, "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m_RL-VwvJK0J:scholar.google.com/&scioq=Multitask+learning+of+Multilingual+Sentence+Representations&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJtChcgAW", "title": "Cheap DNN Pruning with Performance Guarantees", "track": "main", "status": "Reject", "tldr": "A fast pruning algorithm for fully connected DNN layers with theoretical analysis of degradation in Generalisation Error.", "abstract": "Recent DNN pruning algorithms have succeeded in reducing the number of parameters in fully connected layers often with little or no drop in classification accuracy. However most of the existing pruning schemes either have to be applied during training or require a costly retraining procedure after pruning to regain classification accuracy. In this paper we propose a cheap pruning algorithm based on difference of convex (DC) optimisation. We also provide theoretical analysis for the growth in the Generalisation Error (GE) of the new pruned network. Our method can be used with any convex regulariser and allows for a controlled degradation in classification accuracy while being orders of magnitude faster than competing approaches. 
Experiments on common feedforward neural networks show that for sparsity levels above 90% our method achieves 10% higher classification accuracy compared to Hard Thresholding.", "keywords": "pruning;generalisation error;DC optimisation", "primary_area": "", "supplementary_material": "", "author": "Konstantinos Pitas;Mike Davies;Pierre Vandergheynst", "authorids": "konstantinos.pitas@epfl.ch;mike.davies@ed.ac.uk;pierre.vandergheynst@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npitas2018cheap,\ntitle={Cheap {DNN} Pruning with Performance Guarantees },\nauthor={Konstantinos Pitas and Mike Davies and Pierre Vandergheynst},\nyear={2018},\nurl={https://openreview.net/forum?id=SJtChcgAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJtChcgAW", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L2wX39moNO0J:scholar.google.com/&scioq=Cheap+DNN+Pruning+with+Performance+Guarantees&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJtfOEn6-", "title": "ResBinNet: Residual Binary Neural Network", "track": "main", "status": "Reject", "tldr": "Residual Binary Neural Networks significantly improve the convergence rate and inference accuracy of the binary neural networks.", "abstract": "Recent efforts on training light-weight binary neural networks offer promising execution/memory efficiency. This paper introduces ResBinNet, which is a composition of two interlinked methodologies aiming to address the slow convergence speed and limited accuracy of binary convolutional neural networks. The first method, called residual binarization, learns a multi-level binary representation for the features within a certain neural network layer. The second method, called temperature adjustment, gradually binarizes the weights of a particular layer. The two methods jointly learn a set of soft-binarized parameters that improve the convergence rate and accuracy of binary neural networks. We corroborate the applicability and scalability of ResBinNet by implementing a prototype hardware accelerator. 
The accelerator is reconfigurable in terms of the numerical precision of the binarized features, offering a trade-off between runtime and inference accuracy.\n", "keywords": "Binary Neural Networks;Residual Binarization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Ghasemzadeh;Mohammad Samragh;Farinaz Koushanfar", "authorids": "mghasemzadeh@ucsd.edu;msamragh@ucsd.edu;farinaz@ucsd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nghasemzadeh2018resbinnet,\ntitle={ResBinNet: Residual Binary Neural Network},\nauthor={Mohammad Ghasemzadeh and Mohammad Samragh and Farinaz Koushanfar},\nyear={2018},\nurl={https://openreview.net/forum?id=SJtfOEn6-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJtfOEn6-", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=973099274254665627&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SJu63o10b", "title": "UNSUPERVISED METRIC LEARNING VIA NONLINEAR FEATURE SPACE TRANSFORMATIONS", "track": "main", "status": "Reject", "tldr": " a nonlinear unsupervised metric learning framework to boost the performance of clustering algorithms.", "abstract": "In this paper, we propose a nonlinear unsupervised metric learning framework to boost of the performance of clustering algorithms. Under our framework, nonlinear distance metric learning and manifold embedding are integrated and conducted simultaneously to increase the natural separations among data samples. The metric learning component is implemented through feature space transformations, regulated by a nonlinear deformable model called Coherent Point Drifting (CPD). Driven by CPD, data points can get to a higher level of linear separability, which is subsequently picked up by the manifold embedding component to generate well-separable sample projections for clustering. 
Experimental results on synthetic and benchmark datasets show the effectiveness of our proposed approach over the state-of-the-art solutions in unsupervised metric learning.\n", "keywords": "Metric Learning;K-means;CPD;Clustering", "primary_area": "", "supplementary_material": "", "author": "Pin Zhang;Bibo Shi;JundongLiu", "authorids": "pz335412@ohio.edu;bibo.shi@duke.edu;liuj1@ohio.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2018unsupervised,\ntitle={{UNSUPERVISED} {METRIC} {LEARNING} {VIA} {NONLINEAR} {FEATURE} {SPACE} {TRANSFORMATIONS}},\nauthor={Pin Zhang and Bibo Shi and JundongLiu},\nyear={2018},\nurl={https://openreview.net/forum?id=SJu63o10b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJu63o10b", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EnfzXkMjHEwJ:scholar.google.com/&scioq=UNSUPERVISED+METRIC+LEARNING+VIA+NONLINEAR+FEATURE+SPACE+TRANSFORMATIONS&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJvq-EcCZ", "title": "withdraw", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "withdraw", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liyuan Liu;Jingbo Shang;Xiaotao Gu;Xiang Ren;Jian Peng;Jiawei Han", "authorids": "ll2@illinois.edu;shang7@illinois.edu;xiaotao2@illinois.du;xiangren@usc.edu;jianpeng@illinois.edu;hanj@illinois.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SJvq-EcCZ", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 6, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJvrXqvaZ", "title": "Adversary A3C for Robust Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Asynchronous Advantage Actor Critic (A3C) is an effective Reinforcement Learning (RL) algorithm for a wide range of tasks, such as Atari games and robot control. The agent learns policies and value function through trial-and-error interactions with the environment until converging to an optimal policy. Robustness and stability are critical in RL; however, neural network can be vulnerable to noise from unexpected sources and is not likely to withstand very slight disturbances. We note that agents generated from mild environment using A3C are not able to handle challenging environments. Learning from adversarial examples, we proposed an algorithm called Adversary Robust A3C (AR-A3C) to improve the agent\u2019s performance under noisy environments. In this algorithm, an adversarial agent is introduced to the learning process to make it more robust against adversarial disturbances, thereby making it more adaptive to noisy environments. Both simulations and real-world experiments are carried out to illustrate the stability of the proposed algorithm. 
The AR-A3C algorithm outperforms A3C in both clean and noisy environments. ", "keywords": "Adversary;Robust;Reinforcement Learning;A3C", "primary_area": "", "supplementary_material": "", "author": "Zhaoyuan Gu;Zhenzhong Jia;Howie Choset", "authorids": "guzhaoyuan14@gmail.com;zhenzhong.jia@gmail.com;choset@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngu2018adversary,\ntitle={Adversary A3C for Robust Reinforcement Learning},\nauthor={Zhaoyuan Gu and Zhenzhong Jia and Howie Choset},\nyear={2018},\nurl={https://openreview.net/forum?id=SJvrXqvaZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer2", "site": "https://openreview.net/forum?id=SJvrXqvaZ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15409857827066149104&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJvu-GW0b", "title": "Graph2Seq: Scalable Learning Dynamics for Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural networks are increasingly used as a general purpose approach to learning algorithms over graph structured data. However, techniques for representing graphs as real-valued vectors are still in their infancy. Recent works have proposed several approaches (e.g., graph convolutional networks), but as we show in this paper, these methods have difficulty generalizing to large graphs. In this paper we propose Graph2Seq, an embedding framework that represents graphs as an infinite time-series. By not limiting the representation to a fixed dimension, Graph2Seq naturally scales to graphs of arbitrary size. Moreover, through analysis of a formal computational model we show that an unbounded sequence is necessary for scalability. Graph2Seq is also reversible, allowing full recovery of the graph structure from the sequence. Experimental evaluations of Graph2Seq on a variety of combinatorial optimization problems show strong generalization and strict improvement over state of the art. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaileshh Bojja Venkatakrishnan;Mohammad Alizadeh;Pramod Viswanath", "authorids": "bjjvnkt@csail.mit.edu;alizadeh@csail.mit.edu;pramodv@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbojja2018graphseq,\ntitle={Graph2Seq: Scalable Learning Dynamics for Graphs},\nauthor={Shaileshh Bojja Venkatakrishnan and Mohammad Alizadeh and Pramod Viswanath},\nyear={2018},\nurl={https://openreview.net/forum?id=SJvu-GW0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJvu-GW0b", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3204986358554024947&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJw03ceRW", "title": "GENERATIVE LOW-SHOT NETWORK EXPANSION", "track": "main", "status": "Reject", "tldr": " In this paper, we address the problem of Low-shot network-expansion learning", "abstract": "Conventional deep learning classifiers are static in the sense that they are trained on\na predefined set of classes and learning to classify a novel class typically requires\nre-training. In this work, we address the problem of Low-shot network-expansion\nlearning. We introduce a learning framework which enables expanding a pre-trained\n(base) deep network to classify novel classes when the number of examples for the\nnovel classes is particularly small. We present a simple yet powerful distillation\nmethod where the base network is augmented with additional weights to classify\nthe novel classes, while keeping the weights of the base network unchanged. We\nterm this learning hard distillation, since we preserve the response of the network\non the old classes to be equal in both the base and the expanded network. We\nshow that since only a small number of weights needs to be trained, the hard\ndistillation excels for low-shot training scenarios. Furthermore, hard distillation\navoids detriment to classification performance on the base classes. 
Finally, we\nshow that low-shot network expansion can be done with a very small memory\nfootprint by using a compact generative model of the base classes training data\nwith only a negligible degradation relative to learning with the full training set.", "keywords": "Low-Shot Learning;class incremental learning;Network expansion;Generative model;Distillation", "primary_area": "", "supplementary_material": "", "author": "Adi Hayat;Mark Kliger;Shachar Fleishman;Daniel Cohen-Or", "authorids": "adi.hayat3@gmail.com;mark.kliger@gmail.com;shacharfl@gmail.com;cohenor@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhayat2018generative,\ntitle={{GENERATIVE} {LOW}-{SHOT} {NETWORK} {EXPANSION}},\nauthor={Adi Hayat and Mark Kliger and Shachar Fleishman and Daniel Cohen-Or},\nyear={2018},\nurl={https://openreview.net/forum?id=SJw03ceRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJw03ceRW", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8YMV0s9b_DEJ:scholar.google.com/&scioq=GENERATIVE+LOW-SHOT+NETWORK+EXPANSION&hl=en&as_sdt=0,33", "gs_version_total": 6 }, { "title": "Improving the Improved Training of Wasserstein GANs: A Consistency Term and Its Dual Effect", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/4", "id": "SJx9GQb0-", "author_site": "Xiang Wei, Boqing Gong, Zixia Liu, Wei Lu, Liqiang Wang", "tldr": "", "abstract": "Despite being impactful on a variety of problems and applications, the generative adversarial nets (GANs) are remarkably difficult to train. This issue is formally analyzed by \\cite{arjovsky2017towards}, who also propose an alternative direction to avoid the caveats in the minmax two-player training of GANs. The corresponding algorithm, namely, Wasserstein GAN (WGAN) hinges on the 1-Lipschitz continuity of the discriminators. In this paper, we propose a novel approach for enforcing the Lipschitz continuity in the training procedure of WGANs. Our approach seamlessly connects WGAN with one of the recent semi-supervised learning approaches. As a result, it gives rise to not only better photo-realistic samples than the previous methods but also state-of-the-art semi-supervised learning results. 
In particular, to the best of our knowledge, our approach gives rise to the inception score of more than 5.0 with only 1,000 CIFAR10 images and is the first that exceeds the accuracy of 90\\% the CIFAR10 datasets using only 4,000 labeled images.\n", "keywords": "GAN;WGAN", "primary_area": "", "supplementary_material": "", "author": "Xiang Wei;Boqing Gong;Zixia Liu;Wei Lu;Liqiang Wang", "authorids": "yqweixiang@knights.ucf.edu;boqinggo@outlook.com;zixia@knights.ucf.edu;luwei@bjtu.edu.cn;lwang@cs.ucf.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nwei2018improving,\ntitle={Improving the Improved Training of Wasserstein {GAN}s},\nauthor={Xiang Wei and Zixia Liu and Liqiang Wang and Boqing Gong},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJx9GQb0-},\n}", "github": "[![github](/images/github_icon.svg) biuyq/CT-GAN](https://github.com/biuyq/CT-GAN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 31, "authors#_avg": 5, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 332, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3155067773578991569&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJx9GQb0-", "pdf": "https://openreview.net/pdf?id=SJx9GQb0-", "email": ";;;;", "author_num": 5 }, { "id": "SJxE3jlA-", "title": "Now I Remember! Episodic Memory For Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "Implementing and evaluating episodic memory for RL.", "abstract": "Humans rely on episodic memory constantly, in remembering the name of someone they met 10 minutes ago, the plot of a movie as it unfolds, or where they parked the car. Endowing reinforcement learning agents with episodic memory is a key step on the path toward replicating human-like general intelligence. We analyze why standard RL agents lack episodic memory today, and why existing RL tasks don't require it. We design a new form of external memory called Masked Experience Memory, or MEM, modeled after key features of human episodic memory. To evaluate episodic memory we define an RL task based on the common children's game of Concentration. We find that a MEM RL agent leverages episodic memory effectively to master Concentration, unlike the baseline agents we tested.", "keywords": "Reinforcement learning;Deep learning;Episodic memory", "primary_area": "", "supplementary_material": "", "author": "Ricky Loynd;Matthew Hausknecht;Lihong Li;Li Deng", "authorids": "riloynd@microsoft.com;mahauskn@microsoft.com;lihongli.cs@gmail.com;l.deng@ieee.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nloynd2018now,\ntitle={Now I Remember! 
Episodic Memory For Reinforcement Learning},\nauthor={Ricky Loynd and Matthew Hausknecht and Lihong Li and Li Deng},\nyear={2018},\nurl={https://openreview.net/forum?id=SJxE3jlA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJxE3jlA-", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9159947100431627471&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Learning Wasserstein Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/305", "id": "SJyEH91A-", "author_site": "Nicolas Courty, R\u00e9mi Flamary, M\u00e9lanie Ducoffe", "tldr": "We show that it is possible to fastly approximate Wasserstein distances computation by finding an appropriate embedding where Euclidean distance emulates the Wasserstein distance", "abstract": "The Wasserstein distance received a lot of attention recently in the community of machine learning, especially for its principled way of comparing distributions. It has found numerous applications in several hard problems, such as domain adaptation, dimensionality reduction or generative models. However, its use is still limited by a heavy computational cost. Our goal is to alleviate this problem by providing an approximation mechanism that allows to break its inherent complexity. It relies on the search of an embedding where the Euclidean distance mimics the Wasserstein distance. We show that such an embedding can be found with a siamese architecture associated with a decoder network that allows to move from the embedding space back to the original input space. Once this embedding has been found, computing optimization problems in the Wasserstein space (e.g. barycenters, principal directions or even archetypes) can be conducted extremely fast. 
Numerical experiments supporting this idea are conducted on image datasets, and show the wide potential benefits of our method.", "keywords": "Wasserstein distance;metric embedding;Siamese architecture", "primary_area": "", "supplementary_material": "", "author": "Nicolas Courty;R\u00e9mi Flamary;M\u00e9lanie Ducoffe", "authorids": "ncourty@irisa.fr;remi.flamary@unice.fr;ducoffe@i3s.unice.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncourty2018learning,\ntitle={Learning Wasserstein Embeddings},\nauthor={Nicolas Courty and R\u00e9mi Flamary and M\u00e9lanie Ducoffe},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJyEH91A-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;4", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14250377766566233440&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJyEH91A-", "pdf": "https://openreview.net/pdf?id=SJyEH91A-", "email": ";;", "author_num": 3 }, { "title": "Fraternal Dropout", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/5", "id": "SJyVzQ-C-", "author_site": "Konrad Zolna, Devansh Arpit, Dendi Suhubdy, Yoshua Bengio", "tldr": "We propose to train two identical copies of an recurrent neural network (that share parameters) with different dropout masks while minimizing the difference between their (pre-softmax) predictions.", "abstract": "Recurrent neural networks (RNNs) are important class of architectures among neural networks useful for language modeling and sequential prediction. However, optimizing RNNs is known to be harder compared to feed-forward neural networks. A number of techniques have been proposed in literature to address this problem. In this paper we propose a simple technique called fraternal dropout that takes advantage of dropout to achieve this goal. Specifically, we propose to train two identical copies of an RNN (that share parameters) with different dropout masks while minimizing the difference between their (pre-softmax) predictions. In this way our regularization encourages the representations of RNNs to be invariant to dropout mask, thus being robust. We show that our regularization term is upper bounded by the expectation-linear dropout objective which has been shown to address the gap due to the difference between the train and inference phases of dropout. We evaluate our model and achieve state-of-the-art results in sequence modeling tasks on two benchmark datasets - Penn Treebank and Wikitext-2. 
We also show that our approach leads to performance improvement by a significant margin in image captioning (Microsoft COCO) and semi-supervised (CIFAR-10) tasks.", "keywords": "fraternal dropout;activity regularization;recurrent neural networks;RNN;LSTM;faster convergence", "primary_area": "", "supplementary_material": "", "author": "Konrad Zolna;Devansh Arpit;Dendi Suhubdy;Yoshua Bengio", "authorids": "konrad.zolna@gmail.com;devansh.arpit@umontreal.ca;dasuhubd@ncsu.edu;bengioy@iro.umontreal.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzolna2018fraternal,\ntitle={Fraternal Dropout},\nauthor={Konrad Zolna and Devansh Arpit and Dendi Suhubdy and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJyVzQ-C-},\n}", "github": "[![github](/images/github_icon.svg) kondiz/fraternal-dropout](https://github.com/kondiz/fraternal-dropout)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4593127166702636404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SJyVzQ-C-", "pdf": "https://openreview.net/pdf?id=SJyVzQ-C-", "email": ";;;", "author_num": 4 }, { "id": "SJyfrl-0b", "title": "Fast Node Embeddings: Learning Ego-Centric Representations", "track": "main", "status": "Workshop", "tldr": "A faster method for generating node embeddings that employs a number of permutations over a node's immediate neighborhood as context to generate its representation.", "abstract": "Representation learning is one of the foundations of Deep Learning and allowed important improvements on several Machine Learning tasks, such as Neural Machine Translation, Question Answering and Speech Recognition. Recent works have proposed new methods for learning representations for nodes and edges in graphs. Several of these methods are based on the SkipGram algorithm, and they usually process a large number of multi-hop neighbors in order to produce the context from which node representations are learned. In this paper, we propose an effective and also efficient method for generating node embeddings in graphs that employs a restricted number of permutations over the immediate neighborhood of a node as context to generate its representation, thus ego-centric representations. 
We present a thorough evaluation showing that our method outperforms state-of-the-art methods in six different datasets related to the problems of link prediction and node classification, being one to three orders of magnitude faster than baselines when generating node embeddings for very large graphs.", "keywords": "Graph;Node Embeddings;Distributed Representations;Learning Representations", "primary_area": "", "supplementary_material": "", "author": "Tiago Pimentel;Adriano Veloso;Nivio Ziviani", "authorids": "tpimentel@dcc.ufmg.br;adrianov@dcc.ufmg.br;nivio@dcc.ufmg.br", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npimentel2018fast,\ntitle={Fast Node Embeddings: Learning Ego-Centric Representations},\nauthor={Tiago Pimentel and Adriano Veloso and Nivio Ziviani},\nyear={2018},\nurl={https://openreview.net/forum?id=SJyfrl-0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJyfrl-0b", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11491617788373538815&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJzMATlAZ", "title": "Deep Continuous Clustering", "track": "main", "status": "Reject", "tldr": "A clustering algorithm that performs joint nonlinear dimensionality reduction and clustering by optimizing a global continuous objective.", "abstract": "Clustering high-dimensional datasets is hard because interpoint distances become less informative in high-dimensional spaces. We present a clustering algorithm that performs nonlinear dimensionality reduction and clustering jointly. The data is embedded into a lower-dimensional space by a deep autoencoder. The autoencoder is optimized as part of the clustering process. The resulting network produces clustered data. The presented approach does not rely on prior knowledge of the number of ground-truth clusters. Joint nonlinear dimensionality reduction and clustering are formulated as optimization of a global continuous objective. We thus avoid discrete reconfigurations of the objective that characterize prior clustering algorithms. 
Experiments on datasets from multiple domains demonstrate that the presented algorithm outperforms state-of-the-art clustering schemes, including recent methods that use deep networks.", "keywords": "clustering;dimensionality reduction", "primary_area": "", "supplementary_material": "", "author": "Sohil Atul Shah;Vladlen Koltun", "authorids": "sohilas@umd.edu;vkoltun@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\natul2018deep,\ntitle={Deep Continuous Clustering},\nauthor={Sohil Atul Shah and Vladlen Koltun},\nyear={2018},\nurl={https://openreview.net/forum?id=SJzMATlAZ},\n}", "github": "[![github](/images/github_icon.svg) shahsohil/DCC](https://github.com/shahsohil/DCC) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJzMATlAZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJzMATlAZ", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": -0.7205766921228921, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14928213225246835240&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Latent Space Oddity: on the Curvature of Deep Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/130", "id": "SJzRZ-WCZ", "author_site": "Georgios Arvanitidis, Lars Kai Hansen, S\u00f8ren Hauberg", "tldr": "", "abstract": "Deep generative models provide a systematic way to learn nonlinear data distributions through a set of latent variables and a nonlinear \"generator\" function that maps latent points into the input space. The nonlinearity of the generator implies that the latent space gives a distorted view of the input space. Under mild conditions, we show that this distortion can be characterized by a stochastic Riemannian metric, and we demonstrate that distances and interpolants are significantly improved under this metric. This in turn improves probability distributions, sampling algorithms and clustering in the latent space. Our geometric analysis further reveals that current generators provide poor variance estimates and we propose a new generator architecture with vastly improved variance estimates. 
Results are demonstrated on convolutional and fully connected variational autoencoders, but the formalism easily generalizes to other deep generative models.", "keywords": "Generative models;Riemannian Geometry;Latent Space", "primary_area": "", "supplementary_material": "", "author": "Georgios Arvanitidis;Lars Kai Hansen;S\u00f8ren Hauberg", "authorids": "gear@dtu.dk;lkai@dtu.dk;sohau@dtu.dk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\narvanitidis2018latent,\ntitle={Latent Space Oddity: on the Curvature of Deep Generative Models},\nauthor={Georgios Arvanitidis and Lars Kai Hansen and S\u00f8ren Hauberg},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SJzRZ-WCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 308, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10064730276614480748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SJzRZ-WCZ", "pdf": "https://openreview.net/pdf?id=SJzRZ-WCZ", "email": ";;", "author_num": 3 }, { "id": "SJzmJEq6W", "title": "Learning non-linear transform with discriminative and minimum information loss priors", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a novel approach for learning discriminative and sparse representations. It consists of utilizing two different models. A predefined number of non-linear transform models are used in the learning stage, and one sparsifying transform model is used at test time. The non-linear transform models have discriminative and minimum information loss priors. A novel measure related to the discriminative prior is proposed and defined on the support intersection for the transform representations. The minimum information loss prior is expressed as a constraint on the conditioning and the expected coherence of the transform matrix. An equivalence between the non-linear models and the sparsifying model is shown only when the measure that is used to define the discriminative prior goes to zero. An approximation of the measure used in the discriminative prior is addressed, connecting it to a similarity concentration. To quantify the discriminative properties of the transform representation, we introduce another measure and present its bounds. Reflecting the discriminative quality of the transform representation we name it as discrimination power. \n\nTo support and validate the theoretical analysis a practical learning algorithm is presented. We evaluate the advantages and the potential of the proposed algorithm by a computer simulation. 
A favorable performance is shown considering the execution time, the quality of the representation, measured by the discrimination power and the recognition accuracy in comparison with the state-of-the-art methods of the same category.", "keywords": "transform learning;sparse representation;discrimininative prior;information preservation;discrimination power", "primary_area": "", "supplementary_material": "", "author": "Dimche Kostadinov;Slava Voloshynovskiy", "authorids": "dimche.kostadinov@unige.ch;svolos@unige.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkostadinov2018learning,\ntitle={Learning non-linear transform with discriminative and minimum information loss priors},\nauthor={Dimche Kostadinov and Slava Voloshynovskiy},\nyear={2018},\nurl={https://openreview.net/forum?id=SJzmJEq6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJzmJEq6W", "pdf_size": 0, "rating": "4;5;5", "confidence": "2;1;2", "rating_avg": 4.666666666666667, "confidence_avg": 1.6666666666666667, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4931028464566653409&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Sk03Yi10Z", "title": "An Ensemble of Retrieval-Based and Generation-Based Human-Computer Conversation Systems.", "track": "main", "status": "Reject", "tldr": "A novel ensemble of retrieval-based and generation-based for open-domain conversation systems.", "abstract": "Human-computer conversation systems have attracted much attention in Natural Language Processing. Conversation systems can be roughly divided into two categories: retrieval-based and generation-based systems. Retrieval systems search a user-issued utterance (namely a query) in a large conversational repository and return a reply that best matches the query. Generative approaches synthesize new replies. Both ways have certain advantages but suffer from their own disadvantages. We propose a novel ensemble of retrieval-based and generation-based conversation system. The retrieved candidates, in addition to the original query, are fed to a reply generator via a neural network, so that the model is aware of more information. The generated reply together with the retrieved ones then participates in a re-ranking process to find the final reply to output. 
Experimental results show that such an ensemble system outperforms each single module by a large margin.\n", "keywords": "conversation systems;retrieval method;generation method", "primary_area": "", "supplementary_material": "", "author": "Yiping Song;Rui Yan;Cheng-Te Li;Jian-Yun Nie;Ming Zhang;Dongyan Zhao", "authorids": "songyiping@pku.edu.cn;ruiyan@pku.edu.cn;chengte@mail.ncku.edu.tw;nie@iro.umontreal.ca;mzhang_cs@pku.edu.cn;zhaody@pku.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nsong2018an,\ntitle={An Ensemble of Retrieval-Based and Generation-Based Human-Computer Conversation Systems.},\nauthor={Yiping Song and Rui Yan and Cheng-Te Li and Jian-Yun Nie and Ming Zhang and Dongyan Zhao},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk03Yi10Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk03Yi10Z", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6412463055600365068&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Sk0pHeZAW", "title": "Sparse Regularized Deep Neural Networks For Efficient Embedded Learning", "track": "main", "status": "Reject", "tldr": "Compression of Deep neural networks deployed on embedded device. ", "abstract": "Deep learning is becoming more widespread in its application due to its power in solving complex classification problems. However, deep learning models often require large memory and energy consumption, which may prevent them from being deployed effectively on embedded platforms, limiting their applications. This work addresses the problem by proposing methods {\\em Weight Reduction Quantisation} for compressing the memory footprint of the models, including reducing the number of weights and the number of bits to store each weight. Beside, applying with sparsity-inducing regularization, our work focuses on speeding up stochastic variance reduced gradients (SVRG) optimization on non-convex problem. Our method that mini-batch SVRG with $\\ell$1 regularization on non-convex problem has faster and smoother convergence rates than SGD by using adaptive learning rates. 
Experimental evaluation of our approach uses MNIST and CIFAR-10 datasets on LeNet-300-100 and LeNet-5 models, showing our approach can reduce the memory requirements both in the convolutional and fully connected layers by up to 60$\\times$ without affecting their test accuracy.", "keywords": "Sparse representation;Compression Deep Learning Models;L1 regularisation;Optimisation.", "primary_area": "", "supplementary_material": "", "author": "Jia Bi", "authorids": "jb4e14@soton.ac.uk", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbi2018sparse,\ntitle={Sparse Regularized Deep Neural Networks For Efficient Embedded Learning},\nauthor={Jia Bi},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk0pHeZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sk0pHeZAW", "pdf_size": 0, "rating": "2;4;4", "confidence": "3;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TtuY5GwNTR4J:scholar.google.com/&scioq=Sparse+Regularized+Deep+Neural+Networks+For+Efficient+Embedded+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Sk1NTfZAb", "title": "Key Protected Classification for GAN Attack Resilient Collaborative Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large-scale publicly available datasets play a fundamental role in training deep learning models. However, large-scale\ndatasets are difficult to collect in problems that involve processing of sensitive information.\nCollaborative learning techniques provide a privacy-preserving solution in such cases, by enabling\ntraining over a number of private datasets that are not shared by their owners.\nExisting collaborative learning\ntechniques, combined with differential privacy, are shown to be resilient against a passive\nadversary which tries to infer the training data only from the model parameters. However, recently, it has\nbeen shown that the existing collaborative learning techniques are vulnerable to an active adversary that runs a GAN\nattack during the learning phase. In this work, we propose a novel key-based collaborative learning technique that is\nresilient against such GAN attacks. For this purpose, we present a collaborative learning formulation in which class scores \nare protected by class-specific keys, and therefore, prevents a GAN attack. We also show that\nvery high dimensional class-specific keys can be utilized to improve robustness against attacks, without increasing the model complexity. \nOur experimental results on two popular datasets, MNIST and AT&T Olivetti Faces, demonstrate the effectiveness of the proposed technique\nagainst the GAN attack. 
To the best of our knowledge, the proposed approach is the first collaborative learning\nformulation that effectively tackles an active adversary, and, unlike model corruption or differential privacy formulations,\nour approach does not inherently feature a trade-off between model accuracy and data privacy.", "keywords": "privacy preserving deep learning;collaborative learning;adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Mert B\u00fclent Sar\u0131y\u0131ld\u0131z;Ramazan G\u00f6kberk Cinbi\u015f;Erman Ayday", "authorids": "mbsariyildiz@gmail.com;gokberkcinbis@gmail.com;erman@cs.bilkent.edu.tr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nb\u00fclent2018key,\ntitle={Key Protected Classification for {GAN} Attack Resilient Collaborative Learning},\nauthor={Mert B\u00fclent Sar\u0131y\u0131ld\u0131z and Ramazan G\u00f6kberk Cinbi\u015f and Erman Ayday},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk1NTfZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk1NTfZAb", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;2", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BUt5CsEmxkgJ:scholar.google.com/&scioq=Key+Protected+Classification+for+GAN+Attack+Resilient+Collaborative+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/171", "id": "Sk2u1g-0-", "author_site": "Maruan Al-Shedivat, Trapit Bansal, Yuri Burda, Ilya Sutskever, Igor Mordatch, Pieter Abbeel", "tldr": "", "abstract": "Ability to continuously learn and adapt from limited experience in nonstationary environments is an important milestone on the path towards general intelligence. In this paper, we cast the problem of continuous adaptation into the learning-to-learn framework. We develop a simple gradient-based meta-learning algorithm suitable for adaptation in dynamically changing and adversarial scenarios. Additionally, we design a new multi-agent competitive environment, RoboSumo, and define iterated adaptation games for testing various aspects of continuous adaptation. We demonstrate that meta-learning enables significantly more efficient adaptation than reactive baselines in the few-shot regime. 
Our experiments with a population of agents that learn and compete suggest that meta-learners are the fittest.", "keywords": "reinforcement learning;nonstationarity;meta-learning;transfer learning;multi-agent", "primary_area": "", "supplementary_material": "", "author": "Maruan Al-Shedivat;Trapit Bansal;Yura Burda;Ilya Sutskever;Igor Mordatch;Pieter Abbeel", "authorids": "alshedivat@cs.cmu.edu;tbansal@cs.umass.edu;yburda@openai.com;ilyasu@openai.com;mordatch@openai.com;pabbeel@cs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nal-shedivat2018continuous,\ntitle={Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments},\nauthor={Maruan Al-Shedivat and Trapit Bansal and Yura Burda and Ilya Sutskever and Igor Mordatch and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk2u1g-0-},\n}", "github": "[![github](/images/github_icon.svg) openai/robosumo](https://github.com/openai/robosumo)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;4;2", "rating_avg": 8.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 451, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10800934967753473866&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Sk2u1g-0-", "pdf": "https://openreview.net/pdf?id=Sk2u1g-0-", "email": ";;;;;", "author_num": 6 }, { "id": "Sk4w0A0Tb", "title": "Rotational Unit of Memory", "track": "main", "status": "Workshop", "tldr": "A novel RNN model which outperforms significantly the current frontier of models in a variety of sequential tasks.", "abstract": "The concepts of unitary evolution matrices and associative memory have boosted the field of Recurrent Neural Networks (RNN) to state-of-the-art performance in a variety of sequential tasks. However, RNN still has a limited capacity to manipulate long-term memory. To bypass this weakness the most successful applications of RNN use external techniques such as attention mechanisms. In this paper we propose a novel RNN model that unifies the state-of-the-art approaches: Rotational Unit of Memory (RUM). The core of RUM is its rotational operation, which is, naturally, a unitary matrix, providing architectures with the power to learn long-term dependencies by overcoming the vanishing and exploding gradients problem. Moreover, the rotational unit also serves as associative memory. We evaluate our model on synthetic memorization, question answering and language modeling tasks. RUM learns the Copying Memory task completely and improves the state-of-the-art result in the Recall task. RUM\u2019s performance in the bAbI Question Answering task is comparable to that of models with attention mechanism. We also improve the state-of-the-art result to 1.189 bits-per-character (BPC) loss in the Character Level Penn Treebank (PTB) task, which is to signify the applications of RUM to real-world sequential data. 
The universality of our construction, at the core of RNN, establishes RUM as a promising approach to language modeling, speech recognition and machine translation.", "keywords": "RNN;unitary approach;associative memory;language modeling", "primary_area": "", "supplementary_material": "", "author": "Rumen Dangovski;Li Jing;Marin Soljacic", "authorids": "rumenrd@mit.edu;ljing@mit.edu;soljacic@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndangovski2018rotational,\ntitle={Rotational Unit of Memory },\nauthor={Rumen Dangovski and Li Jing and Marin Soljacic},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk4w0A0Tb},\n}", "github": "[![github](/images/github_icon.svg) jingli9111/RUM](https://github.com/jingli9111/RUM) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Sk4w0A0Tb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sk4w0A0Tb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7973974071032448360&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Espresso: Efficient Forward Propagation for Binary Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/304", "id": "Sk6fD5yCb", "author_site": "Fabrizio Pedersoli, George Tzanetakis, Andrea Tagliasacchi", "tldr": "state-of-the-art computational performance implementation of binary neural networks", "abstract": " There are many applications scenarios for which the computational\n performance and memory footprint of the prediction phase of Deep\n Neural Networks (DNNs) need to be optimized. Binary Deep Neural\n Networks (BDNNs) have been shown to be an effective way of achieving\n this objective. In this paper, we show how Convolutional Neural\n Networks (CNNs) can be implemented using binary\n representations. Espresso is a compact, yet powerful\n library written in C/CUDA that features all the functionalities\n required for the forward propagation of CNNs, in a binary file less\n than 400KB, without any external dependencies. Although it is mainly\n designed to take advantage of massive GPU parallelism, Espresso also\n provides an equivalent CPU implementation for CNNs. Espresso\n provides special convolutional and dense layers for BCNNs,\n leveraging bit-packing and bit-wise computations\n for efficient execution. These techniques provide a speed-up of\n matrix-multiplication routines, and at the same time, reduce memory\n usage when storing parameters and activations. We experimentally\n show that Espresso is significantly faster than existing\n implementations of optimized binary neural networks (~ 2\n orders of magnitude). 
Espresso is released under the Apache 2.0\n license and is available at http://github.com/organization/project.", "keywords": "binary deep neural networks;optimized implementation;bitwise computations", "primary_area": "", "supplementary_material": "", "author": "Fabrizio Pedersoli;George Tzanetakis;Andrea Tagliasacchi", "authorids": "fpeder@uvic.ca;gtzan@uvic.ca;ataiya@uvic.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npedersoli2018espresso,\ntitle={Espresso: Efficient Forward Propagation for Binary Deep Neural Networks},\nauthor={Fabrizio Pedersoli and George Tzanetakis and Andrea Tagliasacchi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk6fD5yCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;1", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1646875788450055806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Sk6fD5yCb", "pdf": "https://openreview.net/pdf?id=Sk6fD5yCb", "email": ";;", "author_num": 3 }, { "title": "Lifelong Learning with Dynamically Expandable Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/37", "id": "Sk7KsfW0-", "author_site": "Jaehong Yoon, Eunho Yang, Jeongtae Lee, Sung Ju Hwang", "tldr": "We propose a novel deep network architecture that can dynamically decide its network capacity as it trains on a lifelong learning scenario.", "abstract": "We propose a novel deep network architecture for lifelong learning which we refer to as Dynamically Expandable Network (DEN), that can dynamically decide its network capacity as it trains on a sequence of tasks, to learn a compact overlapping knowledge sharing structure among tasks. DEN is efficiently trained in an online manner by performing selective retraining, dynamically expands network capacity upon arrival of each task with only the necessary number of units, and effectively prevents semantic drift by splitting/duplicating units and timestamping them. We validate DEN on multiple public datasets in lifelong learning scenarios on multiple public datasets, on which it not only significantly outperforms existing lifelong learning methods for deep networks, but also achieves the same level of performance as the batch model with substantially fewer number of parameters. 
", "keywords": "Transfer learning;Lifelong learning;Selective retraining;Dynamic network expansion", "primary_area": "", "supplementary_material": "", "author": "Jaehong Yoon;Eunho Yang;Jeongtae Lee;Sung Ju Hwang", "authorids": "mmvc98@unist.ac.kr;eunhoy@kaist.ac.kr;jtlee@unist.ac.kr;sjhwang82@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nyoon2018lifelong,\ntitle={Lifelong Learning with Dynamically Expandable Networks},\nauthor={Jaehong Yoon and Eunho Yang and Jeongtae Lee and Sung Ju Hwang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk7KsfW0-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Sk7KsfW0-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;2", "rating_avg": 7.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10130565682575038178&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Sk7KsfW0-", "pdf": "https://openreview.net/pdf?id=Sk7KsfW0-", "email": ";;;", "author_num": 4 }, { "id": "Sk7cHb-C-", "title": "Representing dynamically: An active process for describing sequential data", "track": "main", "status": "Reject", "tldr": "A method that build representations of sequential data and its dynamics through generative models with an active process", "abstract": "We propose an unsupervised method for building dynamic representations of sequential data, particularly of observed interactions. The method simultaneously acquires representations of input data and its dynamics. It is based on a hierarchical generative model composed of two levels. In the first level, a model learns representations to generate observed data. In the second level, representational states encode the dynamics of the lower one. The model is designed as a Bayesian network with switching variables represented in the higher level, and which generates transition models. The method actively explores the latent space guided by its knowledge and the uncertainty about it. That is achieved by updating the latent variables from prediction error signals backpropagated to the latent space. So, no encoder or inference models are used since the generators also serve as their inverse transformations.\nThe method is evaluated in two scenarios, with static images and with videos. The results show that the adaptation over time leads to better performance than with similar architectures without temporal dependencies, e.g., variational autoencoders. 
With videos, it is shown that the system extracts the dynamics of the data in states that highly correlate with the ground truth of the actions observed.", "keywords": "Generative Models;Latent representations;Predictive coding;Recurrent networks;Sequential data", "primary_area": "", "supplementary_material": "", "author": "Juan Sebastian Olier;Emilia Barakova;Matthias Rauterberg;Carlo Regazzoni", "authorids": "j.s.olier.jauregui@tue.nl;e.i.barakova@tue.nl;g.w.m.rauterberg@tue.nl;carlo.regazzoni@unige.it", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsebastian2018representing,\ntitle={Representing dynamically: An active process for describing sequential data},\nauthor={Juan Sebastian Olier and Emilia Barakova and Matthias Rauterberg and Carlo Regazzoni},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk7cHb-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Sk7cHb-C-", "pdf_size": 0, "rating": "3;4;4;6", "confidence": "3;4;4;3", "rating_avg": 4.25, "confidence_avg": 3.5, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.2294157338705618, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Jt26HDGPtyQJ:scholar.google.com/&scioq=Representing+dynamically:+An+active+process+for+describing+sequential+data&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Mitigating Adversarial Effects Through Randomization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/238", "id": "Sk9yuql0Z", "author_site": "Cihang Xie, Jianyu Wang, Zhishuai Zhang, Zhou Ren, Alan Yuille", "tldr": "", "abstract": "Convolutional neural networks have demonstrated high accuracy on various tasks in recent years. However, they are extremely vulnerable to adversarial examples. For example, imperceptible perturbations added to clean images can cause convolutional neural networks to fail. In this paper, we propose to utilize randomization at inference time to mitigate adversarial effects. Specifically, we use two randomization operations: random resizing, which resizes the input images to a random size, and random padding, which pads zeros around the input images in a random manner. Extensive experiments demonstrate that the proposed randomization method is very effective at defending against both single-step and iterative attacks. Our method provides the following advantages: 1) no additional training or fine-tuning, 2) very few additional computations, 3) compatible with other adversarial defense methods. By combining the proposed randomization method with an adversarially trained model, it achieves a normalized score of 0.924 (ranked No.2 among 107 defense teams) in the NIPS 2017 adversarial examples defense challenge, which is far better than using adversarial training alone with a normalized score of 0.773 (ranked No.56). 
The code is public available at https://github.com/cihangxie/NIPS2017_adv_challenge_defense.", "keywords": "adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Cihang Xie;Jianyu Wang;Zhishuai Zhang;Zhou Ren;Alan Yuille", "authorids": "cihangxie306@gmail.com;wjyouch@gmail.com;zhshuai.zhang@gmail.com;zhou.ren@snapchat.com;alan.l.yuille@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nxie2018mitigating,\ntitle={Mitigating Adversarial Effects Through Randomization},\nauthor={Cihang Xie and Jianyu Wang and Zhishuai Zhang and Zhou Ren and Alan Yuille},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sk9yuql0Z},\n}", "github": "[![github](/images/github_icon.svg) cihangxie/NIPS2017_adv_challenge_defense](https://github.com/cihangxie/NIPS2017_adv_challenge_defense) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Sk9yuql0Z)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1119418123159333221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Sk9yuql0Z", "pdf": "https://openreview.net/pdf?id=Sk9yuql0Z", "email": ";;;;", "author_num": 5 }, { "title": "When is a Convolutional Filter Easy to Learn?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/317", "id": "SkA-IE06W", "author_site": "Simon Du, Jason D Lee, Yuandong Tian", "tldr": "We prove randomly initialized (stochastic) gradient descent learns a convolutional filter in polynomial time.", "abstract": "We analyze the convergence of (stochastic) gradient descent algorithm for learning a convolutional filter with Rectified Linear Unit (ReLU) activation function. Our analysis does not rely on any specific form of the input distribution and our proofs only use the definition of ReLU, in contrast with previous works that are restricted to standard Gaussian input. We show that (stochastic) gradient descent with random initialization can learn the convolutional filter in polynomial time and the convergence rate depends on the smoothness of the input distribution and the closeness of patches. To the best of our knowledge, this is the first recovery guarantee of gradient-based algorithms for convolutional filter on non-Gaussian input distributions. Our theory also justifies the two-stage learning rate strategy in deep neural networks. While our focus is theoretical, we also present experiments that justify our theoretical findings.", "keywords": "deep learning;convolutional neural network;non-convex optimization;convergence analysis", "primary_area": "", "supplementary_material": "", "author": "Simon S. Du;Jason D. 
Lee;Yuandong Tian", "authorids": "ssdu@cs.cmu.edu;jasonlee@marshall.usc.edu;yuandong@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ns.2018when,\ntitle={When is a Convolutional Filter Easy to Learn?},\nauthor={Simon S. Du and Jason D. Lee and Yuandong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkA-IE06W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "6;8;9", "confidence": "3;3;4", "rating_avg": 7.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 149, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=541769035189129637&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkA-IE06W", "pdf": "https://openreview.net/pdf?id=SkA-IE06W", "email": ";;", "author_num": 3 }, { "id": "SkAK2jg0b", "title": "An Out-of-the-box Full-network Embedding for Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We present a full-network embedding of CNN which outperforms single layer embeddings for transfer learning tasks.", "abstract": "Transfer learning for feature extraction can be used to exploit deep representations in contexts where there is very few training data, where there are limited computational resources, or when tuning the hyper-parameters needed for training is not an option. While previous contributions to feature extraction propose embeddings based on a single layer of the network, in this paper we propose a full-network embedding which successfully integrates convolutional and fully connected features, coming from all layers of a deep convolutional neural network. To do so, the embedding normalizes features in the context of the problem, and discretizes their values to reduce noise and regularize the embedding space. Significantly, this also reduces the computational cost of processing the resultant representations. The proposed method is shown to outperform single layer embeddings on several image classification tasks, while also being more robust to the choice of the pre-trained model used for obtaining the initial features. 
The performance gap in classification accuracy between thoroughly tuned solutions and the full-network embedding is also reduced, which makes of the proposed approach a competitive solution for a large set of applications.", "keywords": "Embedding spaces;feature extraction;transfer learning.", "primary_area": "", "supplementary_material": "", "author": "Dario Garcia-Gasulla;Armand Vilalta;Ferran Par\u00e9s;Jonatan Moreno;Eduard Ayguad\u00e9;Jes\u00fas Labarta;Ulises Cort\u00e9s;Toyotaro Suzumura", "authorids": "dario.garcia@bsc.es;armand.vilalta@bsc.es;ferran.pares@bsc.es;jonatan.moreno@bsc.es;eduard.ayguade@bsc.es;jesus.labarta@bsc.es;ia@cs.upc.edu;suzumurat@gmail.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\ngarcia-gasulla2018an,\ntitle={An Out-of-the-box Full-network Embedding for Convolutional Neural Networks},\nauthor={Dario Garcia-Gasulla and Armand Vilalta and Ferran Par\u00e9s and Jonatan Moreno and Eduard Ayguad\u00e9 and Jes\u00fas Labarta and Ulises Cort\u00e9s and Toyotaro Suzumura},\nyear={2018},\nurl={https://openreview.net/forum?id=SkAK2jg0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkAK2jg0b", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 8, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6053152037112252668&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SkBHr1WRW", "title": "Ego-CNN: An Ego Network-based Representation of Graphs Detecting Critical Structures", "track": "main", "status": "Reject", "tldr": "", "abstract": "While existing graph embedding models can generate useful embedding vectors that perform well on graph-related tasks, what valuable information can be jointly learned by a graph embedding model is less discussed. In this paper, we consider the possibility of detecting critical structures by a graph embedding model. We propose Ego-CNN to embed graph, which works in a local-to-global manner to take advantages of CNNs that gradually expanding the detectable local regions on the graph as the network depth increases. Critical structures can be detected if Ego-CNN is combined with a supervised task model. 
We show that Ego-CNN is (1) competitive to state-of-the-art graph embeddings models, (2) can nicely work with CNNs visualization techniques to show the detected structures, and (3) is efficient and can incorporate with scale-free priors, which commonly occurs in social network datasets, to further improve the training efficiency.", "keywords": "graph embedding;CNN", "primary_area": "", "supplementary_material": "", "author": "Ruo-Chun Tzeng;Shan-Hung Wu", "authorids": "rctzeng@datalab.cs.nthu.edu.tw;shwu@cs.nthu.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntzeng2018egocnn,\ntitle={Ego-{CNN}: An Ego Network-based Representation of Graphs Detecting Critical Structures},\nauthor={Ruo-Chun Tzeng and Shan-Hung Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=SkBHr1WRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=SkBHr1WRW", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KKaNAxqdnJsJ:scholar.google.com/&scioq=Ego-CNN:+An+Ego+Network-based+Representation+of+Graphs+Detecting+Critical+Structures&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SkBYYyZRZ", "title": "Searching for Activation Functions", "track": "main", "status": "Workshop", "tldr": "We use search techniques to discover novel activation functions, and our best discovered activation function, f(x) = x * sigmoid(beta * x), outperforms ReLU on a number of challenging tasks like ImageNet.", "abstract": "The choice of activation functions in deep networks has a significant effect on the training dynamics and task performance. Currently, the most successful and widely-used activation function is the Rectified Linear Unit (ReLU). Although various hand-designed alternatives to ReLU have been proposed, none have managed to replace it due to inconsistent gains. In this work, we propose to leverage automatic search techniques to discover new activation functions. Using a combination of exhaustive and reinforcement learning-based search, we discover multiple novel activation functions. We verify the effectiveness of the searches by conducting an empirical evaluation with the best discovered activation function. Our experiments show that the best discovered activation function, f(x) = x * sigmoid(beta * x), which we name Swish, tends to work better than ReLU on deeper models across a number of challenging datasets. For example, simply replacing ReLUs with Swish units improves top-1 classification accuracy on ImageNet by 0.9% for Mobile NASNet-A and 0.6% for Inception-ResNet-v2. The simplicity of Swish and its similarity to ReLU make it easy for practitioners to replace ReLUs with Swish units in any neural network. ", "keywords": "meta learning;activation functions", "primary_area": "", "supplementary_material": "", "author": "Prajit Ramachandran;Barret Zoph;Quoc V. 
Le", "authorids": "prajitram@gmail.com;barretzoph@google.com;qvl@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nramachandran2018searching,\ntitle={Searching for Activation Functions},\nauthor={Prajit Ramachandran and Barret Zoph and Quoc V. Le},\nyear={2018},\nurl={https://openreview.net/forum?id=SkBYYyZRZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 21 community implementations](https://paperswithcode.com/paper/?openreview=SkBYYyZRZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SkBYYyZRZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;5;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 4870, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=665017396840630897&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SkBcLugC-", "title": "Fast and Accurate Inference with Adaptive Ensemble Prediction for Deep Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Ensembling multiple predictions is a widely-used technique to improve the accuracy of various machine learning tasks. In image classification tasks, for example, averaging the predictions for multiple patches extracted from the input image significantly improves accuracy. Using multiple networks trained independently to make predictions improves accuracy further. One obvious drawback of the ensembling technique is its higher execution cost during inference.% If we average 100 local predictions, the execution cost will be 100 times as high as the cost without the ensemble. This higher cost limits the real-world use of ensembling. In this paper, we first describe our insights on relationship between the probability of the prediction and the effect of ensembling with current deep neural networks; ensembling does not help mispredictions for inputs predicted with a high probability, i.e. the output from the softmax. This finding motivates us to develop a new technique called adaptive ensemble prediction, which achieves the benefits of ensembling with much smaller additional execution costs. Hence, we calculate the confidence level of the prediction for each input from the probabilities of the local predictions during the ensembling computation. If the prediction for an input reaches a high enough probability on the basis of the confidence level, we stop ensembling for this input to avoid wasting computation power. We evaluated the adaptive ensembling by using various datasets and showed that it reduces the computation cost significantly while achieving similar accuracy to the naive ensembling. 
We also showed that our statistically rigorous confidence-level-based termination condition reduces the burden of the task-dependent parameter tuning compared to the naive termination based on the pre-defined threshold in addition to yielding a better accuracy with the same cost.\n", "keywords": "ensemble;confidence level", "primary_area": "", "supplementary_material": "", "author": "Hiroshi Inoue", "authorids": "inouehrs@jp.ibm.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ninoue2018fast,\ntitle={Fast and Accurate Inference with Adaptive Ensemble Prediction for Deep Networks},\nauthor={Hiroshi Inoue},\nyear={2018},\nurl={https://openreview.net/forum?id=SkBcLugC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkBcLugC-", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16589562220513201908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkERSm-0-", "title": "Preliminary theoretical troubleshooting in Variational Autoencoder", "track": "main", "status": "Reject", "tldr": "This paper tries to preliminarily address the disentanglement theoretically in the idealistic situation and practically through noise modelling perspective in the realistic case.", "abstract": "What would be learned by variational autoencoder(VAE) and what influence the disentanglement of VAE? This paper tries to preliminarily address VAE's intrinsic dimension, real factor, disentanglement and indicator issues theoretically in the idealistic situation and implementation issue practically through noise modeling perspective in the realistic case. On intrinsic dimension issue, due to information conservation, the idealistic VAE learns and only learns intrinsic factor dimension. Besides, suggested by mutual information separation property, the constraint induced by Gaussian prior to the VAE objective encourages the information sparsity in dimension. On disentanglement issue, subsequently, inspired by information conservation theorem the clarification on disentanglement in this paper is made. On real factor issue, due to factor equivalence, the idealistic VAE possibly learns any factor set in the equivalence class. On indicator issue, the behavior of current disentanglement metric is discussed, and several performance indicators regarding the disentanglement and generating influence are subsequently raised to evaluate the performance of VAE model and to supervise the used factors. 
On implementation issue, the experiments under noise modeling and constraints empirically testify the theoretical analysis and also show their own characteristic in pursuing disentanglement.", "keywords": "variational autoencoder;information theory;noise modelling;representation learning;generative model;disentanglement", "primary_area": "", "supplementary_material": "", "author": "Shiqi Liu;Qian Zhao;Xiangyong Cao;Deyu Meng;Zilu Ma;Tao Yu", "authorids": "liushiqi@stu.xjtu.edu.cn;dymeng@mail.xjtu.edu.cn;timmy.zhaoqian@gmail.com;460376821@qq.com;1030884089@qq.com;602077855@qq.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nliu2018preliminary,\ntitle={Preliminary theoretical troubleshooting in Variational Autoencoder},\nauthor={Shiqi Liu and Qian Zhao and Xiangyong Cao and Deyu Meng},\nyear={2018},\nurl={https://openreview.net/forum?id=SkERSm-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkERSm-0-", "pdf_size": 0, "rating": "2;3;5", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KBMgxtr65CIJ:scholar.google.com/&scioq=Preliminary+theoretical+troubleshooting+in+Variational+Autoencoder&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkF2D7g0b", "title": "Exploring the Space of Black-box Attacks on Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "Query-based black-box attacks on deep neural networks with adversarial success rates matching white-box attacks", "abstract": "Existing black-box attacks on deep neural networks (DNNs) so far have largely focused on transferability, where an adversarial instance generated for a locally trained model can \u201ctransfer\u201d to attack other learning models. In this paper, we propose novel Gradient Estimation black-box attacks for adversaries with query access to the target model\u2019s class probabilities, which do not rely on transferability. We also propose strategies to decouple the number of queries required to generate each adversarial sample from the dimensionality of the input. An iterative variant of our attack achieves close to 100% adversarial success rates for both targeted and untargeted attacks on DNNs. We carry out extensive experiments for a thorough comparative evaluation of black-box attacks and show that the proposed Gradient Estimation attacks outperform all transferability based black-box attacks we tested on both MNIST and CIFAR-10 datasets, achieving adversarial success rates similar to well known, state-of-the-art white-box attacks. We also apply the Gradient Estimation attacks successfully against a real-world content moderation classi\ufb01er hosted by Clarifai. Furthermore, we evaluate black-box attacks against state-of-the-art defenses. 
We show that the Gradient Estimation attacks are very effective even against these defenses.", "keywords": "adversarial machine learning;black-box attacks", "primary_area": "", "supplementary_material": "", "author": "Arjun Nitin Bhagoji;Warren He;Bo Li;Dawn Song", "authorids": "abhagoji@princeton.edu;_w@eecs.berkeley.edu;lxbosky@gmail.com;dawnsong@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnitin2018exploring,\ntitle={Exploring the Space of Black-box Attacks on Deep Neural Networks},\nauthor={Arjun Nitin Bhagoji and Warren He and Bo Li and Dawn Song},\nyear={2018},\nurl={https://openreview.net/forum?id=SkF2D7g0b},\n}", "github": "[![github](/images/github_icon.svg) sunblaze-ucb/blackbox-attacks](https://github.com/sunblaze-ucb/blackbox-attacks)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkF2D7g0b", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7934932563374222928&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "VoiceLoop: Voice Fitting and Synthesis via a Phonological Loop", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/220", "id": "SkFAWax0-", "author_site": "Yaniv Taigman, Lior Wolf, Adam Polyak, Eliya Nachmani", "tldr": "", "abstract": "We present a new neural text to speech (TTS) method that is able to transform text to speech in voices that are sampled in the wild. Unlike other systems, our solution is able to deal with unconstrained voice samples and without requiring aligned phonemes or linguistic features. The network architecture is simpler than those in the existing literature and is based on a novel shifting buffer working memory. The same buffer is used for estimating the attention, computing the output audio, and for updating the buffer itself. The input sentence is encoded using a context-free lookup table that contains one entry per character or phoneme. The speakers are similarly represented by a short vector that can also be fitted to new identities, even with only a few samples. Variability in the generated speech is achieved by priming the buffer prior to generating the audio. Experimental results on several datasets demonstrate convincing capabilities, making TTS accessible to a wider range of applications. 
In order to promote reproducibility, we release our source code and models.", "keywords": "Voice Synthesis;Multi-Speaker;Differentiable Memory;Text-to-Speech", "primary_area": "", "supplementary_material": "", "author": "Yaniv Taigman;Lior Wolf;Adam Polyak;Eliya Nachmani", "authorids": "yaniv@fb.com;wolf@fb.com;adampolyak@fb.com;enk100@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ntaigman2018voiceloop,\ntitle={VoiceLoop: Voice Fitting and Synthesis via a Phonological Loop},\nauthor={Yaniv Taigman and Lior Wolf and Adam Polyak and Eliya Nachmani},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkFAWax0-},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/loop](https://github.com/facebookresearch/loop) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SkFAWax0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14159878382438547497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkFAWax0-", "pdf": "https://openreview.net/pdf?id=SkFAWax0-", "email": ";;;", "author_num": 4 }, { "id": "SkFEGHx0Z", "title": "Nearest Neighbour Radial Basis Function Solvers for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a radial basis function solver for convolutional neural networks that can be directly applied to both distance metric learning and classification problems. Our method treats all training features from a deep neural network as radial basis function centres and computes loss by summing the influence of a feature's nearby centres in the embedding space. Having a radial basis function centred on each training feature is made scalable by treating it as an approximate nearest neighbour search problem. End-to-end learning of the network and solver is carried out, mapping high dimensional features into clusters of the same class. This results in a well formed embedding space, where semantically related instances are likely to be located near one another, regardless of whether or not the network was trained on those classes. The same loss function is used for both the metric learning and classification problems. We show that our radial basis function solver outperforms state-of-the-art embedding approaches on the Stanford Cars196 and CUB-200-2011 datasets. Additionally, we show that when used as a classifier, our method outperforms a conventional softmax classifier on the CUB-200-2011, Stanford Cars196, Oxford 102 Flowers and Leafsnap fine-grained classification datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Benjamin J. 
Meyer;Ben Harwood;Tom Drummond", "authorids": "benjamin.meyer@monash.edu;ben.harwood@monash.edu;tom.drummond@monash.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nj.2018nearest,\ntitle={Nearest Neighbour Radial Basis Function Solvers for Deep Neural Networks},\nauthor={Benjamin J. Meyer and Ben Harwood and Tom Drummond},\nyear={2018},\nurl={https://openreview.net/forum?id=SkFEGHx0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkFEGHx0Z", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4718673152291811372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Memory Architectures in Recurrent Neural Network Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/205", "id": "SkFqf0lAZ", "author_site": "Dani Yogatama, yishu miao, G\u00e1bor Melis, Wang Ling, Adhiguna Kuncoro, Chris Dyer, Phil Blunsom", "tldr": "", "abstract": "We compare and analyze sequential, random access, and stack memory architectures for recurrent neural network language models. Our experiments on the Penn Treebank and Wikitext-2 datasets show that stack-based memory architectures consistently achieve the best performance in terms of held out perplexity. We also propose a generalization to existing continuous stack models (Joulin & Mikolov,2015; Grefenstette et al., 2015) to allow a variable number of pop operations more naturally that further improves performance. We further evaluate these language models in terms of their ability to capture non-local syntactic dependencies on a subject-verb agreement dataset (Linzen et al., 2016) and establish new state of the art results using memory augmented language models. 
Our results demonstrate the value of stack-structured memory for explaining the distribution of words in natural language, in line with linguistic theories claiming a context-free backbone for natural language.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dani Yogatama;Yishu Miao;Gabor Melis;Wang Ling;Adhiguna Kuncoro;Chris Dyer;Phil Blunsom", "authorids": "dyogatama@google.com;yishu.miao@cs.ox.ac.uk;melisgl@google.com;lingwang@google.com;akuncoro@google.com;cdyer@google.com;pblunsom@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nyogatama2018memory,\ntitle={Memory Architectures in Recurrent Neural Network Language Models},\nauthor={Dani Yogatama and Yishu Miao and Gabor Melis and Wang Ling and Adhiguna Kuncoro and Chris Dyer and Phil Blunsom},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkFqf0lAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "5;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 7, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10504418191751664972&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=SkFqf0lAZ", "pdf": "https://openreview.net/pdf?id=SkFqf0lAZ", "email": ";;;;;;", "author_num": 7 }, { "id": "SkFvV0yC-", "title": "Network Iterative Learning for Dynamic Deep Neural Networks via Morphism", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this research, we present a novel learning scheme called network iterative learning for deep neural networks. Different from traditional optimization algorithms that usually optimize directly on a static objective function, we propose in this work to optimize a dynamic objective function in an iterative fashion capable of adapting its function form when being optimized. The optimization is implemented as a series of intermediate neural net functions that is able to dynamically grow into the targeted neural net objective function. This is done via network morphism so that the network knowledge is fully preserved with each network growth. Experimental results demonstrate that the proposed network iterative learning scheme is able to significantly alleviate the degradation problem. 
Its effectiveness is verified on diverse benchmark datasets.", "keywords": "Network Iterative Learning;Morphism", "primary_area": "", "supplementary_material": "", "author": "Tao Wei;Changhu Wang;Chang Wen Chen", "authorids": "taowei@buffalo.edu;wangchanghu@toutiao.com;chencw@buffalo.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwei2018network,\ntitle={Network Iterative Learning for Dynamic Deep Neural Networks via Morphism},\nauthor={Tao Wei and Changhu Wang and Chang Wen Chen},\nyear={2018},\nurl={https://openreview.net/forum?id=SkFvV0yC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=SkFvV0yC-", "pdf_size": 0, "rating": "5;5;7", "confidence": "2;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vl-WUz1eeloJ:scholar.google.com/&scioq=Network+Iterative+Learning+for+Dynamic+Deep+Neural+Networks+via+Morphism&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Simulated+Unsupervised Learning With Adaptive Data Generation and Bidirectional Mappings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/38", "id": "SkHDoG-Cb", "author_site": "Kangwook Lee, Hoon Kim, Changho Suh", "tldr": "", "abstract": "Collecting a large dataset with high quality annotations is expensive and time-consuming. Recently, Shrivastava et al. (2017) propose Simulated+Unsupervised (S+U) learning: It first learns a mapping from synthetic data to real data, translates a large amount of labeled synthetic data to the ones that resemble real data, and then trains a learning model on the translated data. Bousmalis et al. (2017) propose a similar framework that jointly trains a translation mapping and a learning model. \nWhile these algorithms are shown to achieve state-of-the-art performance on various tasks, there may be room for improvement, as they do not fully leverage the flexibility of the data simulation process and consider only the forward (synthetic to real) mapping. Inspired by this limitation, we propose a new S+U learning algorithm, which fully leverages the flexibility of data simulators and bidirectional mappings between synthetic data and real data. 
We show that our approach achieves the improved performance on the gaze estimation task, outperforming (Shrivastava et al., 2017).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kangwook Lee;Hoon Kim;Changho Suh", "authorids": "kw1jjang@gmail.com;gnsrla12@kaist.ac.kr;chsuh@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2018simulatedunsupervised,\ntitle={Simulated+Unsupervised Learning With Adaptive Data Generation and Bidirectional Mappings},\nauthor={Kangwook Lee and Hoon Kim and Changho Suh},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkHDoG-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "3;6;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17901477215249153215&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkHDoG-Cb", "pdf": "https://openreview.net/pdf?id=SkHDoG-Cb", "email": ";;", "author_num": 3 }, { "id": "SkHkeixAW", "title": "Regularization for Deep Learning: A Taxonomy", "track": "main", "status": "Reject", "tldr": "Systematic categorization of regularization methods for deep learning, revealing their similarities.", "abstract": "Regularization is one of the crucial ingredients of deep learning, yet the term regularization has various definitions, and regularization methods are often studied separately from each other. In our work we present a novel, systematic, unifying taxonomy to categorize existing methods. We distinguish methods that affect data, network architectures, error terms, regularization terms, and optimization procedures. We identify the atomic building blocks of existing methods, and decouple the assumptions they enforce from the mathematical tools they rely on. We do not provide all details about the listed methods; instead, we present an overview of how the methods can be sorted into meaningful categories and sub-categories. This helps revealing links and fundamental similarities between them. 
Finally, we include practical recommendations both for users and for developers of new regularization methods.", "keywords": "neural networks;deep learning;regularization;data augmentation;network architecture;loss function;dropout;residual learning;optimization", "primary_area": "", "supplementary_material": "", "author": "Jan Kuka\u010dka;Vladimir Golkov;Daniel Cremers", "authorids": "jan.kukacka@tum.de;vladimir.golkov@tum.de;cremers@tum.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkuka\u010dka2018regularization,\ntitle={Regularization for Deep Learning: A Taxonomy},\nauthor={Jan Kuka\u010dka and Vladimir Golkov and Daniel Cremers},\nyear={2018},\nurl={https://openreview.net/forum?id=SkHkeixAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkHkeixAW", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 566, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15465425579939531365&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SkHl6MWC-", "title": "Regularization Neural Networks via Constrained Virtual Movement Field", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We provide a new perspective on regularizing neural networks. We smooth the objective of neural networks w.r.t. small adversarial perturbations of the inputs. Different from previous works, we assume the adversarial perturbations are caused by a movement field. When the magnitude of the movement field approaches 0, we call it a virtual movement field. By introducing the movement field, we cast the problem of finding adversarial perturbations into the problem of finding an adversarial movement field. By adding proper geometrical constraints to the movement field, such smoothness can be approximated in closed form by solving a min-max problem, and its geometric meaning is clear. We define the approximated smoothness as the regularization term. We derive three regularization terms as running examples, which measure the smoothness w.r.t. shift, rotation and scale respectively by adding different constraints. We evaluate our methods on synthetic data, MNIST and CIFAR-10. Experimental results show that our proposed method can significantly improve the baseline neural networks. 
Compared with the state of the art regularization methods, proposed method achieves a tradeoff between accuracy and geometrical interpretability as well as computational cost.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhendong Zhang;Cheolkon Jung", "authorids": "zhd.zhang.ai@gmail.com;zhengzk@xidian.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2018regularization,\ntitle={Regularization Neural Networks via Constrained Virtual Movement Field},\nauthor={Zhendong Zhang and Cheolkon Jung},\nyear={2018},\nurl={https://openreview.net/forum?id=SkHl6MWC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkHl6MWC-", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xlJXxEZp0gUJ:scholar.google.com/&scioq=Regularization+Neural+Networks+via+Constrained+Virtual+Movement+Field&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "id": "SkJKHMW0Z", "title": "Recurrent Relational Networks for complex relational reasoning", "track": "main", "status": "Reject", "tldr": "We introduce Recurrent Relational Networks, a powerful and general neural network module for relational reasoning, and use it to solve 96.6% of the hardest Sudokus and 19/20 BaBi tasks.", "abstract": "Humans possess an ability to abstractly reason about objects and their interactions, an ability not shared with state-of-the-art deep learning models. Relational networks, introduced by Santoro et al. (2017), add the capacity for relational reasoning to deep neural networks, but are limited in the complexity of the reasoning tasks they can address. We introduce recurrent relational networks which increase the suite of solvable tasks to those that require an order of magnitude more steps of relational reasoning. We use recurrent relational networks to solve Sudoku puzzles and achieve state-of-the-art results by solving 96.6% of the hardest Sudoku puzzles, where relational networks fail to solve any. We also apply our model to the BaBi textual QA dataset solving 19/20 tasks which is competitive with state-of-the-art sparse differentiable neural computers. 
The recurrent relational network is a general purpose module that can augment any neural network model with the capacity to do many-step relational reasoning.", "keywords": "relational reasoning;graph neural networks", "primary_area": "", "supplementary_material": "", "author": "Rasmus Berg Palm;Ulrich Paquet;Ole Winther", "authorids": "rasmusbergpalm@gmail.com;upaq@google.com;olwi@dtu.dk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nberg2018recurrent,\ntitle={Recurrent Relational Networks for complex relational reasoning},\nauthor={Rasmus Berg Palm and Ulrich Paquet and Ole Winther},\nyear={2018},\nurl={https://openreview.net/forum?id=SkJKHMW0Z},\n}", "github": "[![github](/images/github_icon.svg) Kyubyong/sudoku](https://github.com/Kyubyong/sudoku)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkJKHMW0Z", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkJd_y-Cb", "title": "Word2net: Deep Representations of Language", "track": "main", "status": "Reject", "tldr": "Word2net is a novel method for learning neural network representations of words that can use syntactic information to learn better semantic features.", "abstract": "Word embeddings extract semantic features of words from large datasets of text.\nMost embedding methods rely on a log-bilinear model to predict the occurrence\nof a word in a context of other words. Here we propose word2net, a method that\nreplaces their linear parametrization with neural networks. For each term in the\nvocabulary, word2net posits a neural network that takes the context as input and\noutputs a probability of occurrence. Further, word2net can use the hierarchical\norganization of its word networks to incorporate additional meta-data, such as\nsyntactic features, into the embedding model. For example, we show how to share\nparameters across word networks to develop an embedding model that includes\npart-of-speech information. We study word2net with two datasets, a collection\nof Wikipedia articles and a corpus of U.S. Senate speeches. Quantitatively, we\nfound that word2net outperforms popular embedding methods on predicting held-\nout words and that sharing parameters based on part of speech further boosts\nperformance. 
Qualitatively, word2net learns interpretable semantic representations\nand, compared to vector-based methods, better incorporates syntactic information.", "keywords": "neural language models;word embeddings;neural networks", "primary_area": "", "supplementary_material": "", "author": "Maja Rudolph;Francisco Ruiz;David Blei", "authorids": "marirudolph@gmail.com;f.ruiz@columbia.edu;david.blei@columbia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrudolph2018wordnet,\ntitle={Word2net: Deep Representations of Language},\nauthor={Maja Rudolph and Francisco Ruiz and David Blei},\nyear={2018},\nurl={https://openreview.net/forum?id=SkJd_y-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkJd_y-Cb", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zhje7n-lLqsJ:scholar.google.com/&scioq=Word2net:+Deep+Representations+of+Language&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "SkNQeiRpb", "title": "Training Deep AutoEncoders for Recommender Systems", "track": "main", "status": "Reject", "tldr": "This paper demonstrates how to train deep autoencoders end-to-end to achieve SoA results on time-split Netflix data set.", "abstract": "This paper proposes a new model for the rating prediction task in recommender systems which significantly outperforms previous state-of-the art models on a time-split Netflix data set. Our model is based on deep autoencoder with 6 layers and is trained end-to-end without any layer-wise pre-training. We empirically demonstrate that: a) deep autoencoder models generalize much better than the shallow ones, b) non-linear activation functions with negative parts are crucial for training deep models, and c) heavy use of regularization techniques such as dropout is necessary to prevent over-fitting. We also propose a new training algorithm based on iterative output re-feeding to overcome natural sparseness of collaborate filtering. The new algorithm significantly speeds up training and improves model performance. 
Our code is publicly available.", "keywords": "autoencoder;recommendations;collaborative filtering;selu", "primary_area": "", "supplementary_material": "", "author": "Oleksii Kuchaiev;Boris Ginsburg", "authorids": "kuchaev@gmail.com;boris.ginsburg@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkuchaiev2018training,\ntitle={Training Deep AutoEncoders for Recommender Systems},\nauthor={Oleksii Kuchaiev and Boris Ginsburg},\nyear={2018},\nurl={https://openreview.net/forum?id=SkNQeiRpb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkNQeiRpb", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9664597639338312413&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkOb1Fl0Z", "title": "A Flexible Approach to Automated RNN Architecture Generation", "track": "main", "status": "Workshop", "tldr": "We define a flexible DSL for RNN architecture generation that allows RNNs of varying size and complexity and propose a ranking function that represents RNNs as recursive neural networks, simulating their performance to decide on the most promising architectures.", "abstract": "The process of designing neural architectures requires expert knowledge and extensive trial and error.\nWhile automated architecture search may simplify these requirements, the recurrent neural network (RNN) architectures generated by existing methods are limited in both flexibility and components.\nWe propose a domain-specific language (DSL) for use in automated architecture search which can produce novel RNNs of arbitrary depth and width.\nThe DSL is flexible enough to define standard architectures such as the Gated Recurrent Unit and Long Short Term Memory and allows the introduction of non-standard RNN components such as trigonometric curves and layer normalization. 
Using two different candidate generation techniques, random search with a ranking function and reinforcement learning, \nwe explore the novel architectures produced by the RNN DSL for language modeling and machine translation domains.\nThe resulting architectures do not follow human intuition yet perform well on their targeted tasks, suggesting the space of usable RNN architectures is far larger than previously assumed.", "keywords": "reinforcement learning;architecture search;ranking function;recurrent neural networks;recursive neural networks", "primary_area": "", "supplementary_material": "", "author": "Martin Schrimpf;Stephen Merity;James Bradbury;Richard Socher", "authorids": "msch@mit.edu;smerity@smerity.com;james.bradbury@salesforce.com;richard@socher.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschrimpf2018a,\ntitle={A Flexible Approach to Automated {RNN} Architecture Generation},\nauthor={Martin Schrimpf and Stephen Merity and James Bradbury and Richard Socher},\nyear={2018},\nurl={https://openreview.net/forum?id=SkOb1Fl0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkOb1Fl0Z", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8111960316421570736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkPoRg10b", "title": "Rethinking generalization requires revisiting old ideas: statistical mechanics approaches and complex learning behavior", "track": "main", "status": "Reject", "tldr": "Rethinking generalization requires revisiting old ideas: statistical mechanics approaches and complex learning behavior", "abstract": "We describe an approach to understand the peculiar and counterintuitive generalization properties of deep neural networks. The approach involves going beyond worst-case theoretical capacity control frameworks that have been popular in machine learning in recent years to revisit old ideas in the statistical mechanics of neural networks. Within this approach, we present a prototypical Very Simple Deep Learning (VSDL) model, whose behavior is controlled by two control parameters, one describing an effective amount of data, or load, on the network (that decreases when noise is added to the input), and one with an effective temperature interpretation (that increases when algorithms are early stopped). Using this model, we describe how a very simple application of ideas from the statistical mechanics theory of generalization provides a strong qualitative description of recently-observed empirical results regarding the inability of deep neural networks not to overfit training data, discontinuous learning and sharp transitions in the generalization properties of learning algorithms, etc.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Charles H. Martin;Michael W. 
Mahoney", "authorids": "charles@calculationconsulting.com;mmahoney@stat.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018rethinking,\ntitle={Rethinking generalization requires revisiting old ideas: statistical mechanics approaches and complex learning behavior},\nauthor={Charles H. Martin and Michael W. Mahoney},\nyear={2018},\nurl={https://openreview.net/forum?id=SkPoRg10b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkPoRg10b", "pdf_size": 0, "rating": "3;6;7", "confidence": "3;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.7205766921228921, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9857443037434123726&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SkRsFSRpb", "title": "GeoSeq2Seq: Information Geometric Sequence-to-Sequence Networks", "track": "main", "status": "Workshop", "tldr": "", "abstract": "The Fisher information metric is an important foundation of information geometry, wherein it allows us to approximate the local geometry of a probability distribution. Recurrent neural networks such as the Sequence-to-Sequence (Seq2Seq) networks that have lately been used to yield state-of-the-art performance on speech translation or image captioning have so far ignored the geometry of the latent embedding, that they iteratively learn. We propose the information geometric Seq2Seq (GeoSeq2Seq) network which abridges the gap between deep recurrent neural networks and information geometry. Specifically, the latent embedding offered by a recurrent network is encoded as a Fisher kernel of a parametric Gaussian Mixture Model, a formalism common in computer vision. 
We utilise such a network to predict the shortest routes between two nodes of a graph by learning the adjacency matrix using the GeoSeq2Seq formalism; our results show that for such a problem the probabilistic representation of the latent embedding supersedes the non-probabilistic embedding by 10-15\\%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alessandro Bay;Biswa Sengupta", "authorids": "alessandro.bay@cortexica.com;biswasengupta@yahoo.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbay2018geoseqseq,\ntitle={GeoSeq2Seq: Information Geometric Sequence-to-Sequence Networks},\nauthor={Alessandro Bay and Biswa Sengupta},\nyear={2018},\nurl={https://openreview.net/forum?id=SkRsFSRpb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkRsFSRpb", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18328231735975908644&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Intrinsic Motivation and Automatic Curricula via Asymmetric Self-Play", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/146", "id": "SkT5Yg-RZ", "author_site": "Sainbayar Sukhbaatar, Zeming Lin, Ilya Kostrikov, Gabriel Synnaeve, Arthur Szlam, Rob Fergus", "tldr": "Unsupervised learning for reinforcement learning using an automatic curriculum of self-play", "abstract": "We describe a simple scheme that allows an agent to learn about its environment in an unsupervised manner. Our scheme pits two versions of the same agent, Alice and Bob, against one another. Alice proposes a task for Bob to complete; and then Bob attempts to complete the task. In this work we will focus on two kinds of environments: (nearly) reversible environments and environments that can be reset. Alice will \"propose\" the task by doing a sequence of actions and then Bob must undo or repeat them, respectively. Via an appropriate reward structure, Alice and Bob automatically generate a curriculum of exploration, enabling unsupervised training of the agent. 
When Bob is deployed on an RL task within the environment, this unsupervised training reduces the number of supervised episodes needed to learn, and in some cases converges to a higher reward.", "keywords": "self-play;automatic curriculum;intrinsic motivation;unsupervised learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Sainbayar Sukhbaatar;Zeming Lin;Ilya Kostrikov;Gabriel Synnaeve;Arthur Szlam;Rob Fergus", "authorids": "sainbar@cs.nyu.edu;zlin@fb.com;kostrikov@cs.nyu.edu;gab@fb.com;aszlam@fb.com;fergus@cs.nyu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nsukhbaatar2018intrinsic,\ntitle={Intrinsic Motivation and Automatic Curricula via Asymmetric Self-Play},\nauthor={Sainbayar Sukhbaatar and Zeming Lin and Ilya Kostrikov and Gabriel Synnaeve and Arthur Szlam and Rob Fergus},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkT5Yg-RZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SkT5Yg-RZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;8;8", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 1.0, "gs_citation": 464, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4160913521858709316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkT5Yg-RZ", "pdf": "https://openreview.net/pdf?id=SkT5Yg-RZ", "email": ";;;;;", "author_num": 6 }, { "title": "Coulomb GANs: Provably Optimal Nash Equilibria via Potential Fields", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/251", "id": "SkVqXOxCb", "author_site": "Thomas Unterthiner, Bernhard Nessler, Calvin Seward, G\u00fcnter Klambauer, Martin Heusel, Hubert Ramsauer, Sepp Hochreiter", "tldr": "Coulomb GANs can optimally learn a distribution by posing the distribution learning problem as optimizing a potential field", "abstract": "Generative adversarial networks (GANs) evolved into one of the most successful unsupervised techniques for generating realistic images. Even though it has recently been shown that GAN training converges, GAN models often end up in local Nash equilibria that are associated with mode collapse or otherwise fail to model the target distribution. We introduce Coulomb GANs, which pose the GAN learning problem as a potential field, where generated samples are attracted to training set samples but repel each other. The discriminator learns a potential field while the generator decreases the energy by moving its samples along the vector (force) field determined by the gradient of the potential field. Through decreasing the energy, the GAN model learns to generate samples according to the whole target distribution and does not only cover some of its modes. We prove that Coulomb GANs possess only one Nash equilibrium which is optimal in the sense that the model distribution equals the target distribution. 
We show the efficacy of Coulomb GANs on LSUN bedrooms, CelebA faces, CIFAR-10 and the Google Billion Word text generation.", "keywords": "Deep Learning;Generative Adversarial Network;GAN;Generative Model;Potential Field", "primary_area": "", "supplementary_material": "", "author": "Thomas Unterthiner;Bernhard Nessler;Calvin Seward;G\u00fcnter Klambauer;Martin Heusel;Hubert Ramsauer;Sepp Hochreiter", "authorids": "unterthiner@bioinf.jku.at;nessler@bioinf.jku.at;seward@bioinf.jku.at;klambauer@bioinf.jku.at;mheusel@gmail.com;ramsauer@bioinf.jku.at;hochreit@bioinf.jku.at", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nunterthiner2018coulomb,\ntitle={Coulomb {GAN}s: Provably Optimal Nash Equilibria via Potential Fields},\nauthor={Thomas Unterthiner and Bernhard Nessler and Calvin Seward and G\u00fcnter Klambauer and Martin Heusel and Hubert Ramsauer and Sepp Hochreiter},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkVqXOxCb},\n}", "github": "[![github](/images/github_icon.svg) bioinf-jku/coulomb_gan](https://github.com/bioinf-jku/coulomb_gan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;2;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14788505867309328713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SkVqXOxCb", "pdf": "https://openreview.net/pdf?id=SkVqXOxCb", "email": ";;;;;;", "author_num": 7 }, { "id": "SkYMnLxRW", "title": "Weighted Transformer Network for Machine Translation", "track": "main", "status": "Reject", "tldr": "Using branched attention with learned combination weights outperforms the baseline transformer for machine translation tasks.", "abstract": "State-of-the-art results on neural machine translation often use attentional sequence-to-sequence models with some form of convolution or recursion. Vaswani et. al. (2017) propose a new architecture that avoids recurrence and convolution completely. Instead, it uses only self-attention and feed-forward layers. While the proposed architecture achieves state-of-the-art results on several machine translation tasks, it requires a large number of parameters and training iterations to converge. We propose Weighted Transformer, a Transformer with modified attention layers, that not only outperforms the baseline network in BLEU score but also converges 15-40% faster. Specifically, we replace the multi-head attention by multiple self-attention branches that the model learns to combine during the training process. 
Our model improves the state-of-the-art performance by 0.5 BLEU points on the WMT 2014 English-to-German translation task and by 0.4 on the English-to-French translation task.", "keywords": "transformer;branching;attention;machine translation", "primary_area": "", "supplementary_material": "", "author": "Karim Ahmed;Nitish Shirish Keskar;Richard Socher", "authorids": "karim.mmm@gmail.com;keskar.nitish@u.northwestern.edu;richard@socher.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nahmed2018weighted,\ntitle={Weighted Transformer Network for Machine Translation},\nauthor={Karim Ahmed and Nitish Shirish Keskar and Richard Socher},\nyear={2018},\nurl={https://openreview.net/forum?id=SkYMnLxRW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=SkYMnLxRW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkYMnLxRW", "pdf_size": 0, "rating": "4;6;9", "confidence": "4;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.11470786693528084, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13807743807630368021&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkYXvCR6W", "title": "Compact Encoding of Words for Efficient Character-level Convolutional Neural Networks Text Classification", "track": "main", "status": "Reject", "tldr": "Using compression techniques to encode words is a possibility for faster training of CNNs and dimensionality reduction of the representation", "abstract": "This paper puts forward a new text-to-tensor representation that relies on information compression techniques to assign shorter codes to the most frequently used characters. This representation is language-independent with no need for pretraining and produces an encoding with no information loss. It provides an adequate description of the morphology of text, as it is able to represent prefixes, declensions, and inflections with similar vectors and to represent even words unseen in the training dataset. Similarly, as it is compact yet sparse, it is ideal for speeding up training times using tensor processing libraries. As part of this paper, we show that this technique is especially effective when coupled with convolutional neural networks (CNNs) for text classification at the character level. We apply two variants of CNN coupled with it. 
Experimental results show that it drastically reduces the number of parameters to be optimized, resulting in competitive classification accuracy values in only a fraction of the time spent by one-hot encoding representations, thus enabling training in commodity hardware.", "keywords": "Character Level Convolutional Networks;Text Classification;Word Compressing", "primary_area": "", "supplementary_material": "", "author": "Wemerson Marinho;Luis Marti;Nayat Sanchez-pi", "authorids": "wemerson_marinho@id.uff.br;lmarti@ic.uff.br;nayat@ime.uerj.br", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmarinho2018compact,\ntitle={Compact Encoding of Words for Efficient Character-level Convolutional Neural Networks Text Classification},\nauthor={Wemerson Marinho and Luis Marti and Nayat Sanchez-pi},\nyear={2018},\nurl={https://openreview.net/forum?id=SkYXvCR6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkYXvCR6W", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;5", "rating_avg": 3.0, "confidence_avg": 5.0, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RI9xK4rOgAEJ:scholar.google.com/&scioq=Compact+Encoding+of+Words+for+Efficient+Character-level+Convolutional+Neural+Networks+Text+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkYibHlRb", "title": "SQLNet: Generating Structured Queries From Natural Language Without Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Synthesizing SQL queries from natural language is a long-standing open problem and has been attracting considerable interest recently. Toward solving the problem, the de facto approach is to employ a sequence-to-sequence-style model. Such an approach will necessarily require the SQL queries to be serialized. Since the same SQL query may have multiple equivalent serializations, training a sequence-to-sequence-style model is sensitive to the choice from one of them. This phenomenon is documented as the \"order-matters\" problem. Existing state-of-the-art approaches rely on reinforcement learning to reward the decoder when it generates any of the equivalent serializations. However, we observe that the improvement from reinforcement learning is limited.\n \nIn this paper, we propose a novel approach, i.e., SQLNet, to fundamentally solve this problem by avoiding the sequence-to-sequence structure when the order does not matter. In particular, we employ a sketch-based approach where the sketch contains a dependency graph, so that one prediction can be done by taking into consideration only the previous predictions that it depends on. In addition, we propose a sequence-to-set model as well as the column attention mechanism to synthesize the query based on the sketch. 
By combining all these novel techniques, we show that SQLNet can outperform the prior art by 9% to 13% on the WikiSQL task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaojun Xu;Chang Liu;Dawn Song", "authorids": "xuxiaojun1005@gmail.com;liuchang@eecs.berkeley.edu;dawnsong@cs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nxu2018sqlnet,\ntitle={{SQLN}et: Generating Structured Queries From Natural Language Without Reinforcement Learning},\nauthor={Xiaojun Xu and Chang Liu and Dawn Song},\nyear={2018},\nurl={https://openreview.net/forum?id=SkYibHlRb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 13 community implementations](https://paperswithcode.com/paper/?openreview=SkYibHlRb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkYibHlRb", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": 487, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17143697547708916221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkZ-BnyCW", "title": "Learning Deep Generative Models With Discrete Latent Variables", "track": "main", "status": "Reject", "tldr": "", "abstract": "There have been numerous recent advancements on learning deep generative models with latent variables thanks to the reparameterization trick that allows to train deep directed models effectively. However, since reparameterization trick only works on continuous variables, deep generative models with discrete latent variables still remain hard to train and perform considerably worse than their continuous counterparts. In this paper, we attempt to shrink this gap by introducing a new architecture and its learning procedure. We develop a hybrid generative model with binary latent variables that consists of an undirected graphical model and a deep neural network. We propose an efficient two-stage pretraining and training procedure that is crucial for learning these models. 
Experiments on binarized digits and images of natural scenes demonstrate that our model achieves close to the state-of-the-art performance in terms of density estimation and is capable of generating coherent images of natural scenes.", "keywords": "deep generative models;deep learning", "primary_area": "", "supplementary_material": "", "author": "Hengyuan Hu;Ruslan Salakhutdinov", "authorids": "hengyuah@andrew.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhu2018learning,\ntitle={Learning Deep Generative Models With Discrete Latent Variables},\nauthor={Hengyuan Hu and Ruslan Salakhutdinov},\nyear={2018},\nurl={https://openreview.net/forum?id=SkZ-BnyCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkZ-BnyCW", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=256340665418938184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Can Neural Networks Understand Logical Entailment?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/175", "id": "SkZxCk-0Z", "author_site": "Richard Evans, David Saxton, David Amos, Pushmeet Kohli, Edward Grefenstette", "tldr": "We introduce a new dataset of logical entailments for the purpose of measuring models' ability to capture and exploit the structure of logical expressions against an entailment prediction task.", "abstract": "We introduce a new dataset of logical entailments for the purpose of measuring models' ability to capture and exploit the structure of logical expressions against an entailment prediction task. We use this task to compare a series of architectures which are ubiquitous in the sequence-processing literature, in addition to a new model class---PossibleWorldNets---which computes entailment as a ``convolution over possible worlds''. 
Results show that convolutional networks present the wrong inductive bias for this class of problems relative to LSTM RNNs, tree-structured neural networks outperform LSTM RNNs due to their enhanced ability to exploit the syntax of logic, and PossibleWorldNets outperform all benchmarks.", "keywords": "structure;neural networks;logic;dataset", "primary_area": "", "supplementary_material": "", "author": "Richard Evans;David Saxton;David Amos;Pushmeet Kohli;Edward Grefenstette", "authorids": "richardevans@google.com;saxton@google.com;davidamos@google.com;pushmeet@google.com;etg@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nevans2018can,\ntitle={Can Neural Networks Understand Logical Entailment?},\nauthor={Richard Evans and David Saxton and David Amos and Pushmeet Kohli and Edward Grefenstette},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkZxCk-0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -1.0, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16989317627315816083&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SkZxCk-0Z", "pdf": "https://openreview.net/pdf?id=SkZxCk-0Z", "email": ";;;;", "author_num": 5 }, { "id": "SkaPsfZ0W", "title": "Network of Graph Convolutional Networks Trained on Random Walks", "track": "main", "status": "Reject", "tldr": "We make a network of Graph Convolution Networks, feeding each a different power of the adjacency matrix, combining all their representation into a classification sub-network, achieving state-of-the-art on semi-supervised node classification.", "abstract": "Graph Convolutional Networks (GCNs) are a recently proposed architecture which has had success in semi-supervised learning on graph-structured data. At the same time, unsupervised learning of graph embeddings has benefited from the information contained in random walks. In this paper we propose a model, Network of GCNs (N-GCN), which marries these two lines of work. At its core, N-GCN trains multiple instances of GCNs over node pairs discovered at different distances in random walks, and learns a combination of the instance outputs which optimizes the classification objective. Our experiments show that our proposed N-GCN model achieves state-of-the-art performance on all of the challenging node classification tasks we consider: Cora, Citeseer, Pubmed, and PPI. 
In addition, our proposed method has other desirable properties, including generalization to recently proposed semi-supervised learning methods such as GraphSAGE, allowing us to propose N-SAGE, and resilience to adversarial input perturbations.", "keywords": "Graph Convolution;Deep Learning;Network of Networks", "primary_area": "", "supplementary_material": "", "author": "Sami Abu-El-Haija;Amol Kapoor;Bryan Perozzi;Joonseok Lee", "authorids": "haija@google.com;ajk2227@columbia.edu;bperozzi@acm.org;joonseok@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nabu-el-haija2018network,\ntitle={Network of Graph Convolutional Networks Trained on Random Walks},\nauthor={Sami Abu-El-Haija and Amol Kapoor and Bryan Perozzi and Joonseok Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=SkaPsfZ0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkaPsfZ0W", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dViwkEvC-BQJ:scholar.google.com/&scioq=Network+of+Graph+Convolutional+Networks+Trained+on+Random+Walks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "A Scalable Laplace Approximation for Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/224", "id": "Skdvd2xAZ", "author_site": "Hippolyt Ritter, Aleksandar Botev, David Barber", "tldr": "We construct a Kronecker factored Laplace approximation for neural networks that leads to an efficient matrix normal distribution over the weights.", "abstract": "We leverage recent insights from second-order optimisation for neural networks to construct a Kronecker factored Laplace approximation to the posterior over the weights of a trained network. Our approximation requires no modification of the training procedure, enabling practitioners to estimate the uncertainty of their models currently used in production without having to retrain them. We extensively compare our method to using Dropout and a diagonal Laplace approximation for estimating the uncertainty of a network. We demonstrate that our Kronecker factored method leads to better uncertainty estimates on out-of-distribution data and is more robust to simple adversarial attacks. Our approach only requires calculating two square curvature factor matrices for each layer. Their size is equal to the respective square of the input and output size of the layer, making the method efficient both computationally and in terms of memory usage. 
We illustrate its scalability by applying it to a state-of-the-art convolutional network architecture.", "keywords": "deep learning;neural networks;laplace approximation;bayesian deep learning", "primary_area": "", "supplementary_material": "", "author": "Hippolyt Ritter;Aleksandar Botev;David Barber", "authorids": "j.ritter@cs.ucl.ac.uk;botevmg@gmail.com;d.barber@cs.ucl.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nritter2018a,\ntitle={A Scalable Laplace Approximation for Neural Networks},\nauthor={Hippolyt Ritter and Aleksandar Botev and David Barber},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Skdvd2xAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 528, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3068639073703398000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Skdvd2xAZ", "pdf": "https://openreview.net/pdf?id=Skdvd2xAZ", "email": ";;", "author_num": 3 }, { "id": "SkeR2JURZ", "title": "Distributed Restarting NewtonCG Method for Large-Scale Empirical Risk Minimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose a distributed damped Newton method in which sample size is gradually increasing to quickly obtain a solution whose empirical loss is under satisfactory statistical accuracy. Our proposed method is multistage in which the solution of one stage serves as a warm start for the next stage which contains more samples (including the samples in the previous stage). This overall multistage algorithm reduce the number of passes over data. Moreover, our algorithm in nature is easy to be distributed and shares the strong scaling property indicating that acceleration is always expected by using more computing nodes. Various iteration complexity results regarding descent direction computation and stopping criteria are analyzed under convex setting. 
Our experimental results illustrate that the proposed algorithm can outperform other comparable methods for training machine learning tasks including neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Majid Jahani;Xi He;Chenxin Ma;Dheevatsa Mudigere;Aryan Mokhtari;Alejandro Ribeiro;Martin Takac", "authorids": "maj314@lehigh.edu;xih314@lehigh.edu;chm514@lehigh.edu;dheevatsa.mudigere@intel.com;aryanm@seas.upenn.edu;aribeiro@seas.upenn.edu;martin.taki@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SkeR2JURZ", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 7, "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mJtGxX_zHZwJ:scholar.google.com/&scioq=Distributed+Restarting+NewtonCG+Method+for+Large-Scale+Empirical+Risk+Minimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkfNU2e0Z", "title": "Statestream: A toolbox to explore layerwise-parallel deep neural networks", "track": "main", "status": "Reject", "tldr": "We define a concept of layerwise model-parallel deep neural networks, for which layers operate in parallel, and provide a toolbox to design, train, evaluate, and on-line interact with these networks.", "abstract": "Building deep neural networks to control autonomous agents which have to interact in real-time with the physical world, such as robots or automotive vehicles, requires a seamless integration of time into a network\u2019s architecture. The central question of this work is how the temporal nature of reality should be reflected in the execution of a deep neural network and its components. Most artificial deep neural networks are partitioned into a directed graph of connected modules or layers and the layers themselves consist of elemental building blocks, such as single units. For most deep neural networks, all units of a layer are processed synchronously and in parallel, but layers themselves are processed in a sequential manner. In contrast, all elements of a biological neural network are processed in parallel. In this paper, we define a class of networks between these two extreme cases. These networks are executed in a streaming or synchronous layerwise-parallel manner, unlocking the layers of such networks for parallel processing. Compared to the standard layerwise-sequential deep networks, these new layerwise-parallel networks show a fundamentally different temporal behavior and flow of information, especially for networks with skip or recurrent connections. We argue that layerwise-parallel deep networks are better suited for future challenges of deep neural network design, such as large functional modularized and/or recurrent architectures as well as networks allocating different network capacities dependent on current stimulus and/or task complexity. We lay out basic properties and discuss major challenges for layerwise-parallel networks. 
Additionally, we provide a toolbox to design, train, evaluate, and online-interact with layerwise-parallel networks.", "keywords": "model-parallel;parallelization;software platform", "primary_area": "", "supplementary_material": "", "author": "Volker Fischer", "authorids": "volker.fischer@de.bosch.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nfischer2018statestream,\ntitle={Statestream: A toolbox to explore layerwise-parallel deep neural networks},\nauthor={Volker Fischer},\nyear={2018},\nurl={https://openreview.net/forum?id=SkfNU2e0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkfNU2e0Z", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:30x8NOuLTQ8J:scholar.google.com/&scioq=Statestream:+A+toolbox+to+explore+layerwise-parallel+deep+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkffVjUaW", "title": "Building effective deep neural networks one feature at a time", "track": "main", "status": "Reject", "tldr": "A bottom-up algorithm that expands CNNs starting with one feature per layer to architectures with sufficient representational capacity.", "abstract": "Successful training of convolutional neural networks is often associated with sufficiently deep architectures composed of high amounts of features. These networks typically rely on a variety of regularization and pruning techniques to converge to less redundant states. We introduce a novel bottom-up approach to expand representations in fixed-depth architectures. These architectures start from just a single feature per layer and greedily increase the width of individual layers to attain effective representational capacities needed for a specific task. While network growth can rely on a family of metrics, we propose a computationally efficient version based on feature time evolution and demonstrate its potency in determining feature importance and a network\u2019s effective capacity. We demonstrate how automatically expanded architectures converge to similar topologies that benefit from fewer parameters or improved accuracy and exhibit systematic correspondence in representational complexity with the specified task. 
In contrast to conventional design patterns with a typical monotonic increase in the number of features with increased depth, we observe that CNNs perform better when there are more learnable parameters in intermediate layers, with falloffs towards earlier and later layers.", "keywords": "convolution neural networks;architecture search;meta-learning;representational capacity", "primary_area": "", "supplementary_material": "", "author": "Martin Mundt;Tobias Weis;Kishore Konda;Visvanathan Ramesh", "authorids": "mundt@fias.uni-frankfurt.de;weis@ccc.cs.uni-frankfurt.de;kishore.konda@insofe.edu.in;ramesh@fias.uni-frankfurt.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmundt2018building,\ntitle={Building effective deep neural networks one feature at a time},\nauthor={Martin Mundt and Tobias Weis and Kishore Konda and Visvanathan Ramesh},\nyear={2018},\nurl={https://openreview.net/forum?id=SkffVjUaW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkffVjUaW", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.6933752452815364, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YvJK0S9H6GUJ:scholar.google.com/&scioq=Building+effective+deep+neural+networks+one+feature+at+a+time&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/68", "id": "SkhQHMW0W", "author_site": "Yujun Lin, Song Han, Huizi Mao, Yu Wang, Bill Dally", "tldr": "we find 99.9% of the gradient exchange in distributed SGD is redundant; we reduce the communication bandwidth by two orders of magnitude without losing accuracy. ", "abstract": "Large-scale distributed training requires significant communication bandwidth for gradient exchange that limits the scalability of multi-node training, and requires expensive high-bandwidth network infrastructure. The situation gets even worse with distributed training on mobile devices (federated learning), which suffers from higher latency, lower throughput, and intermittent poor connections. In this paper, we find 99.9% of the gradient exchange in distributed SGD is redundant, and propose Deep Gradient Compression (DGC) to greatly reduce the communication bandwidth. To preserve accuracy during compression, DGC employs four methods: momentum correction, local gradient clipping, momentum factor masking, and warm-up training. We have applied Deep Gradient Compression to image classification, speech recognition, and language modeling with multiple datasets including Cifar10, ImageNet, Penn Treebank, and Librispeech Corpus. In these scenarios, Deep Gradient Compression achieves a gradient compression ratio from 270x to 600x without losing accuracy, cutting the gradient size of ResNet-50 from 97MB to 0.35MB, and for DeepSpeech from 488MB to 0.74MB. 
Deep gradient compression enables large-scale distributed training on inexpensive commodity 1Gbps Ethernet and facilitates distributed training on mobile.", "keywords": "distributed training", "primary_area": "", "supplementary_material": "", "author": "Yujun Lin;Song Han;Huizi Mao;Yu Wang;Bill Dally", "authorids": "yujunlin@stanford.edu;songhan@stanford.edu;huizi@stanford.edu;yu-wang@mail.tsinghua.edu.cn;dally@stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlin2018deep,\ntitle={Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training},\nauthor={Yujun Lin and Song Han and Huizi Mao and Yu Wang and Bill Dally},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SkhQHMW0W},\n}", "github": "[![github](/images/github_icon.svg) synxlin/deep-gradient-compression](https://github.com/synxlin/deep-gradient-compression) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SkhQHMW0W)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 40, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 1771, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2485379403852124678&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkhQHMW0W", "pdf": "https://openreview.net/pdf?id=SkhQHMW0W", "email": ";;;;", "author_num": 5 }, { "id": "SkiCjzNTZ", "title": "Spontaneous Symmetry Breaking in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "Closed form results for deep learning in the layer decoupling limit applicable to Residual Networks", "abstract": "We propose a framework to understand the unprecedented performance and robustness of deep neural networks using field theory. Correlations between the weights within the same layer can be described by symmetries in that layer, and networks generalize better if such symmetries are broken to reduce the redundancies of the weights. Using a two parameter field theory, we find that the network can break such symmetries itself towards the end of training in a process commonly known in physics as spontaneous symmetry breaking. This corresponds to a network generalizing itself without any user input layers to break the symmetry, but by communication with adjacent layers. In the layer decoupling limit applicable to residual networks (He et al., 2015), we show that the remnant symmetries that survive the non-linear layers are spontaneously broken based on empirical results. The Lagrangian for the non-linear and weight layers together has striking similarities with the one in quantum field theory of a scalar. 
Using results from quantum field theory we show that our framework is able to explain many experimentally observed phenomena, such as training on random labels with zero error (Zhang et al., 2017), the information bottleneck and the phase transition out of it (Shwartz-Ziv & Tishby, 2017), shattered gradients (Balduzzi et al., 2017), and many more.", "keywords": "deep learning;physics;field theory", "primary_area": "", "supplementary_material": "", "author": "Ricky Fok;Aijun An;Xiaogang Wang", "authorids": "ricky.fok3@gmail.com;ricky.fok3@gmail.com;ricky.fok3@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfok2018spontaneous,\ntitle={Spontaneous Symmetry Breaking in Deep Neural Networks},\nauthor={Ricky Fok and Aijun An and Xiaogang Wang},\nyear={2018},\nurl={https://openreview.net/forum?id=SkiCjzNTZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkiCjzNTZ", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15656933699866907973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Stabilizing Adversarial Nets with Prediction Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/213", "id": "Skj8Kag0Z", "author_site": "Abhay Kumar Yadav, Sohil Shah, Zheng Xu, David Jacobs, Tom Goldstein", "tldr": "We present a simple modification to the alternating SGD method, called a prediction step, that improves the stability of adversarial networks.", "abstract": "Adversarial neural networks solve many important problems in data science, but are notoriously difficult to train. These difficulties come from the fact that optimal weights for adversarial nets correspond to saddle points, and not minimizers, of the loss function. The alternating stochastic gradient methods typically used for such problems do not reliably converge to saddle points, and when convergence does happen it is often highly sensitive to learning rates. We propose a simple modification of stochastic gradient descent that stabilizes adversarial networks. We show, both in theory and practice, that the proposed method reliably converges to saddle points. 
This makes adversarial networks less likely to \"collapse,\" and enables faster training with larger learning rates.", "keywords": "adversarial networks;optimization", "primary_area": "", "supplementary_material": "", "author": "Abhay Yadav;Sohil Shah;Zheng Xu;David Jacobs;Tom Goldstein", "authorids": "jaiabhay@cs.umd.edu;sohilas@umd.edu;xuzh@cs.umd.edu;djacobs@umiacs.umd.edu;tomg@cs.umd.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyadav2018stabilizing,\ntitle={Stabilizing Adversarial Nets with Prediction Methods},\nauthor={Abhay Yadav and Sohil Shah and Zheng Xu and David Jacobs and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Skj8Kag0Z},\n}", "github": "[![github](/images/github_icon.svg) jaiabhayk/stableGAN](https://github.com/jaiabhayk/stableGAN)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;9", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1304972437215881711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Skj8Kag0Z", "pdf": "https://openreview.net/pdf?id=Skj8Kag0Z", "email": ";;;;", "author_num": 5 }, { "id": "Skk3Jm96W", "title": "Some Considerations on Learning to Explore via Meta-Reinforcement Learning", "track": "main", "status": "Workshop", "tldr": "Modifications to MAML and RL2 that should allow for better exploration. ", "abstract": "We consider the problem of exploration in meta reinforcement learning. Two new meta reinforcement learning algorithms are suggested: E-MAML and ERL2. Results are presented on a novel environment we call 'Krazy World' and a set of maze environments. 
We show E-MAML and ERL2 deliver better performance on tasks where exploration is important.", "keywords": "reinforcement learning;rl;exploration;meta learning;meta reinforcement learning;curiosity", "primary_area": "", "supplementary_material": "", "author": "Bradly Stadie;Ge Yang;Rein Houthooft;Xi Chen;Yan Duan;Yuhuai Wu;Pieter Abbeel;Ilya Sutskever", "authorids": "bstadie@berkeley.edu;yangge1987@gmail.com;rein.hh@gmail.com;adslcx@gmail.com;dementrock@gmail.com;ywu@cs.toronto.edu;pabbeel@gmail.com;ilyasu@openai.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nstadie2018some,\ntitle={Some Considerations on Learning to Explore via Meta-Reinforcement Learning},\nauthor={Bradly Stadie and Ge Yang and Rein Houthooft and Xi Chen and Yan Duan and Yuhuai Wu and Pieter Abbeel and Ilya Sutskever},\nyear={2018},\nurl={https://openreview.net/forum?id=Skk3Jm96W},\n}", "github": "[![github](/images/github_icon.svg) episodeyang/e-maml](https://github.com/episodeyang/e-maml) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=Skk3Jm96W)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Skk3Jm96W", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 8, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4346250006714927023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SkmM6M_pW", "title": "Egocentric Spatial Memory Network", "track": "main", "status": "Reject", "tldr": "first deep neural network for modeling Egocentric Spatial Memory inspired by neurophysiological discoveries of navigation cells in mammalian brain", "abstract": "Inspired by neurophysiological discoveries of navigation cells in the mammalian\nbrain, we introduce the first deep neural network architecture for modeling Egocentric\nSpatial Memory (ESM). It learns to estimate the pose of the agent and\nprogressively construct top-down 2D global maps from egocentric views in a spatially\nextended environment. During the exploration, our proposed ESM network\nmodel updates belief of the global map based on local observations using a recurrent\nneural network. It also augments the local mapping with a novel external\nmemory to encode and store latent representations of the visited places based on\ntheir corresponding locations in the egocentric coordinate. This enables the agents\nto perform loop closure and mapping correction. This work contributes in the\nfollowing aspects: first, our proposed ESM network provides an accurate mapping\nability which is vitally important for embodied agents to navigate to goal locations.\nIn the experiments, we demonstrate the functionalities of the ESM network in\nrandom walks in complicated 3D mazes by comparing with several competitive\nbaselines and state-of-the-art Simultaneous Localization and Mapping (SLAM)\nalgorithms. Secondly, we faithfully hypothesize the functionality and the working\nmechanism of navigation cells in the brain. 
Comprehensive analysis of our model\nsuggests the essential role of individual modules in our proposed architecture and\ndemonstrates efficiency of communications among these modules. We hope this\nwork would advance research in the collaboration and communications over both\nfields of computer science and computational neuroscience.", "keywords": "spatial memory;egocentric vision;deep neural network;navigation", "primary_area": "", "supplementary_material": "", "author": "Mengmi Zhang;Keng Teck Ma;Joo Hwee Lim;Shih-Cheng Yen;Qi Zhao;Jiashi Feng", "authorids": "a0091624@u.nus.edu;makt@i2r.a-star.edu.sg;joohwee@i2r.a-star.edu.sg;shihcheng@nus.edu.sg;qzhao@cs.umn.edu;elefjia@nus.edu.sg", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhang2018egocentric,\ntitle={Egocentric Spatial Memory Network},\nauthor={Mengmi Zhang and Keng Teck Ma and Joo Hwee Lim and Shih-Cheng Yen and Qi Zhao and Jiashi Feng},\nyear={2018},\nurl={https://openreview.net/forum?id=SkmM6M_pW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkmM6M_pW", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkmiegW0b", "title": "Challenges in Disentangling Independent Factors of Variation", "track": "main", "status": "Workshop", "tldr": "It is a mostly theoretical paper that describes the challenges in disentangling factors of variation, using autoencoders and GAN.", "abstract": "\nWe study the problem of building models that disentangle independent factors of variation. Such models encode features that can efficiently be used for classification and to transfer attributes between different images in image synthesis. As data we use a weakly labeled training set, where labels indicate what single factor has changed between two data samples, although the relative value of the change is unknown. This labeling is of particular interest as it may be readily available without annotation costs. We introduce an autoencoder model and train it through constraints on image pairs and triplets. We show the role of feature dimensionality and adversarial training theoretically and experimentally. We formally prove the existence of the reference ambiguity, which is inherently present in the disentangling task when weakly labeled data is used. The numerical value of a factor has different meaning in different reference frames. When the reference depends on other factors, transferring that factor becomes ambiguous. 
We demonstrate experimentally that the proposed model can successfully transfer attributes on several datasets, but show also cases when the reference ambiguity occurs.\n", "keywords": "disentangling;factors;attribute;transfer;autoencoder;GAN", "primary_area": "", "supplementary_material": "", "author": "Attila Szabo;Qiyang Hu;Tiziano Portenier;Matthias Zwicker;Paolo Favaro", "authorids": "szabo@inf.unibe.ch;hu@inf.unibe.ch;portenier@inf.unibe.ch;zwicker@inf.unibe.ch;paolo.favaro@inf.unibe.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\n2018challenges,\ntitle={Challenges in Disentangling Independent Factors of Variation},\nauthor={Attila Szabo and Qiyang Hu and Tiziano Portenier and Matthias Zwicker and Paolo Favaro},\nyear={2018},\nurl={https://openreview.net/forum?id=SkmiegW0b},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SkmiegW0b)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkmiegW0b", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1225796184074177533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SknC0bW0-", "title": "Continuous-fidelity Bayesian Optimization with Knowledge Gradient", "track": "main", "status": "Reject", "tldr": "We propose a Bayes-optimal Bayesian optimization algorithm for hyperparameter tuning by exploiting cheap approximations.", "abstract": "While Bayesian optimization (BO) has achieved great success in optimizing expensive-to-evaluate black-box functions, especially tuning hyperparameters of neural networks, methods such as random search (Li et al., 2016) and multi-fidelity BO (e.g. Klein et al. (2017)) that exploit cheap approximations, e.g. training on a smaller training data or with fewer iterations, can outperform standard BO approaches that use only full-fidelity observations. In this paper, we propose a novel Bayesian optimization algorithm, the continuous-fidelity knowledge gradient (cfKG) method, that can be used when fidelity is controlled by one or more continuous settings such as training data size and the number of training iterations. cfKG characterizes the value of the information gained by sampling a point at a given fidelity, choosing to sample at the point and fidelity with the largest value per unit cost. Furthermore, cfKG can be generalized, following Wu et al. (2017), to settings where derivatives are available in the optimization process, e.g. large-scale kernel learning, and where more than one point can be evaluated simultaneously. Numerical experiments show that cfKG outperforms state-of-art algorithms when optimizing synthetic functions, tuning convolutional neural networks (CNNs) on CIFAR-10 and SVHN, and in large-scale kernel learning.", "keywords": "Continuous fidelity;Bayesian optimization;fast;knowledge gradient;hyperparameter optimization", "primary_area": "", "supplementary_material": "", "author": "Jian Wu;Peter I. 
Frazier", "authorids": "jw926@cornell.edu;pf98@cornell.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwu2018continuousfidelity,\ntitle={Continuous-fidelity Bayesian Optimization with Knowledge Gradient},\nauthor={Jian Wu and Peter I. Frazier},\nyear={2018},\nurl={https://openreview.net/forum?id=SknC0bW0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SknC0bW0-", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;5", "rating_avg": 5.0, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9039165433560014484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Towards Synthesizing Complex Programs From Input-Output Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/269", "id": "Skp1ESxRZ", "author_site": "Xinyun Chen, Chang Liu, Dawn Song", "tldr": "", "abstract": "In recent years, deep learning techniques have been developed to improve the performance of program synthesis from input-output examples. Albeit its significant progress, the programs that can be synthesized by state-of-the-art approaches are still simple in terms of their complexity. In this work, we move a significant step forward along this direction by proposing a new class of challenging tasks in the domain of program synthesis from input-output examples: learning a context-free parser from pairs of input programs and their parse trees. We show that this class of tasks are much more challenging than previously studied tasks, and the test accuracy of existing approaches is almost 0%.\n\nWe tackle the challenges by developing three novel techniques inspired by three novel observations, which reveal the key ingredients of using deep learning to synthesize a complex program. First, the use of a non-differentiable machine is the key to effectively restrict the search space. Thus our proposed approach learns a neural program operating a domain-specific non-differentiable machine. Second, recursion is the key to achieve generalizability. Thus, we bake-in the notion of recursion in the design of our non-differentiable machine. Third, reinforcement learning is the key to learn how to operate the non-differentiable machine, but it is also hard to train the model effectively with existing reinforcement learning algorithms from a cold boot. We develop a novel two-phase reinforcement learning-based search algorithm to overcome this issue. 
In our evaluation, we show that using our novel approach, neural parsing programs can be learned to achieve 100% test accuracy on test inputs that are 500x longer than the training samples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Chang Liu;Dawn Song", "authorids": "xinyun.chen@berkeley.edu;liuchang@eecs.berkeley.edu;dawnsong.travel@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018towards,\ntitle={Towards Synthesizing Complex Programs From Input-Output Examples},\nauthor={Xinyun Chen and Chang Liu and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Skp1ESxRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "2;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.6546536707079772, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14446769365288800858&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Skp1ESxRZ", "pdf": "https://openreview.net/pdf?id=Skp1ESxRZ", "email": ";;", "author_num": 3 }, { "id": "SkqV-XZRZ", "title": "Variational Bi-LSTMs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent neural networks like long short-term memory (LSTM) are important architectures for sequential prediction tasks. LSTMs (and RNNs in general) model sequences along the forward time direction. Bidirectional LSTMs (Bi-LSTMs), which model sequences along both forward and backward directions, generally perform better at such tasks because they capture a richer representation of the data. In the training of Bi-LSTMs, the forward and backward paths are learned independently. We propose a variant of the Bi-LSTM architecture, which we call Variational Bi-LSTM, that creates a dependence between the two paths (during training, but which may be omitted during inference). Our model acts as a regularizer and encourages the two networks to inform each other in making their respective predictions using distinct information. 
We perform ablation studies to better understand the different components of our model and evaluate the method on various benchmarks, showing state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samira Shabanian;Devansh Arpit;Adam Trischler;Yoshua Bengio", "authorids": "s.shabanian@gmail.com;devansharpit@gmail.com;adam.trischler@microsoft.com;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshabanian2018variational,\ntitle={Variational Bi-{LSTM}s},\nauthor={Samira Shabanian and Devansh Arpit and Adam Trischler and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=SkqV-XZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkqV-XZRZ", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15078175515550044991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SkrHeXbCW", "title": "Learning Representations for Faster Similarity Search", "track": "main", "status": "Reject", "tldr": "We show how to get good representations from the point of view of Simiarity Search.", "abstract": "In high dimensions, the performance of nearest neighbor algorithms depends crucially on structure in the data.\nWhile traditional nearest neighbor datasets consisted mostly of hand-crafted feature vectors, an increasing number of datasets comes from representations learned with neural networks.\nWe study the interaction between nearest neighbor algorithms and neural networks in more detail.\nWe find that the network architecture can significantly influence the efficacy of nearest neighbor algorithms even when the classification accuracy is unchanged.\nBased on our experiments, we propose a number of training modifications that lead to significantly better datasets for nearest neighbor algorithms.\nOur modifications lead to learned representations that can accelerate nearest neighbor queries by 5x.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ludwig Schmidt;Kunal Talwar", "authorids": "ludwigs@mit.edu;kunal@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nschmidt2018learning,\ntitle={Learning Representations for Faster Similarity Search},\nauthor={Ludwig Schmidt and Kunal Talwar},\nyear={2018},\nurl={https://openreview.net/forum?id=SkrHeXbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkrHeXbCW", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3AzR4eZFfXcJ:scholar.google.com/&scioq=Learning+Representations+for+Faster+Similarity+Search&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SksY3deAW", "title": "Learning Deep ResNet Blocks Sequentially 
using Boosting Theory", "track": "main", "status": "Reject", "tldr": "We prove a multiclass boosting theory for the ResNet architectures which simultaneously creates a new technique for multiclass boosting and provides a new algorithm for ResNet-style architectures.", "abstract": "We prove a multiclass boosting theory for the ResNet architectures which simultaneously creates a new technique for multiclass boosting and provides a new algorithm for ResNet-style architectures. Our proposed training algorithm, BoostResNet, is particularly suitable in non-differentiable architectures. Our method only requires the relatively inexpensive sequential training of T \"shallow ResNets\". We prove that the training error decays exponentially with the depth T if the weak module classifiers that we train perform slightly better than some weak baseline. In other words, we propose a weak learning condition and prove a boosting theory for ResNet under the weak learning condition. A generalization error bound based on margin theory is proved and suggests that ResNet could be resistant to overfitting using a network with l_1 norm bounded weights.", "keywords": "residual network;boosting theory;training error guarantee", "primary_area": "", "supplementary_material": "", "author": "Furong Huang;Jordan T. Ash;John Langford;Robert E. Schapire", "authorids": "furongh@cs.umd.edu;jordantash@gmail.com;jcl@microsoft.com;schapire@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhuang2018learning,\ntitle={Learning Deep ResNet Blocks Sequentially using Boosting Theory},\nauthor={Furong Huang and Jordan T. Ash and John Langford and Robert E. Schapire},\nyear={2018},\nurl={https://openreview.net/forum?id=SksY3deAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SksY3deAW", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 136, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10501544880463084863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SktLlGbRZ", "title": "CyCADA: Cycle-Consistent Adversarial Domain Adaptation", "track": "main", "status": "Reject", "tldr": "An unsupervised domain adaptation approach which adapts at both the pixel and feature levels", "abstract": "Domain adaptation is critical for success in new, unseen environments.\nAdversarial adaptation models applied in feature spaces discover domain invariant representations, but are difficult to visualize and sometimes fail to capture pixel-level and low-level domain shifts.\nRecent work has shown that generative adversarial networks combined with cycle-consistency constraints are surprisingly effective at mapping images between domains, even without the use of aligned image pairs.\nWe propose a novel discriminatively-trained Cycle-Consistent Adversarial Domain Adaptation model.\nCyCADA adapts representations at both the pixel-level and feature-level, enforces cycle-consistency while leveraging a task loss, and does not require aligned pairs. 
Our model can be applied in a variety of visual recognition and prediction settings.\nWe show new state-of-the-art results across multiple adaptation tasks, including digit classification and semantic segmentation of road scenes demonstrating transfer from synthetic to real world domains.", "keywords": "domain adaptation;unsupervised learning;classification;semantic segmentation", "primary_area": "", "supplementary_material": "", "author": "Judy Hoffman;Eric Tzeng;Taesung Park;Jun-Yan Zhu;Phillip Isola;Kate Saenko;Alyosha Efros;Trevor Darrell", "authorids": "jhoffman@eecs.berkeley.edu;etzeng@eecs.berkeley.edu;taesung_park@berkeley.edu;junyanz@berkeley.edu;isola@eecs.berkeley.edu;saenko@bu.edu;efros@eecs.berkeley.edu;trevor@eecs.berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nhoffman2018cycada,\ntitle={Cy{CADA}: Cycle-Consistent Adversarial Domain Adaptation},\nauthor={Judy Hoffman and Eric Tzeng and Taesung Park and Jun-Yan Zhu and Phillip Isola and Kate Saenko and Alyosha Efros and Trevor Darrell},\nyear={2018},\nurl={https://openreview.net/forum?id=SktLlGbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SktLlGbRZ", "pdf_size": 0, "rating": "5;5;9", "confidence": "5;5;5", "rating_avg": 6.333333333333333, "confidence_avg": 5.0, "replies_avg": 11, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 3787, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13169730024102659375&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "Sktm4zWRb", "title": "Soft Value Iteration Networks for Planetary Rover Path Planning", "track": "main", "status": "Reject", "tldr": "We propose an improvement to value iteration networks, with applications to planetary rover path planning.", "abstract": "Value iteration networks are an approximation of the value iteration (VI) algorithm implemented with convolutional neural networks to make VI fully differentiable. In this work, we study these networks in the context of robot motion planning, with a focus on applications to planetary rovers. The key challenging task in learning-based motion planning is to learn a transformation from terrain observations to a suitable navigation reward function. In order to deal with complex terrain observations and policy learning, we propose a value iteration recurrence, referred to as the soft value iteration network (SVIN). SVIN is designed to produce more effective training gradients through the value iteration network. It relies on a soft policy model, where the policy is represented with a probability distribution over all possible actions, rather than a deterministic policy that returns only the best action. We demonstrate the effectiveness of the proposed method in robot motion planning scenarios. In particular, we study the application of SVIN to very challenging problems in planetary rover navigation and present early training results on data gathered by the Curiosity rover that is currently operating on Mars.", "keywords": "value iteration networks;robotics;space robotics;imitation learning;convolutional neural networks;path planning", "primary_area": "", "supplementary_material": "", "author": "Max Pflueger;Ali Agha;Gaurav S. 
Sukhatme", "authorids": "mpflueger@gmail.com;aliahga@jpl.nasa.gov;gaurav@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npflueger2018soft,\ntitle={Soft Value Iteration Networks for Planetary Rover Path Planning},\nauthor={Max Pflueger and Ali Agha and Gaurav S. Sukhatme},\nyear={2018},\nurl={https://openreview.net/forum?id=Sktm4zWRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sktm4zWRb", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13451973776368141521&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Skvd-myR-", "title": "Learning Non-Metric Visual Similarity for Image Retrieval", "track": "main", "status": "Reject", "tldr": "Similarity network to learn a non-metric visual similarity estimation between a pair of images", "abstract": "Measuring visual (dis)similarity between two or more instances within a data distribution is a fundamental task in many applications, specially in image retrieval. Theoretically, non-metric distances are able to generate a more complex and accurate similarity model than metric distances, provided that the non-linear data distribution is precisely captured by the similarity model. In this work, we analyze a simple approach for deep learning networks to be used as an approximation of non-metric similarity functions and we study how these models generalize across different image retrieval datasets.", "keywords": "image retrieval;visual similarity;non-metric learning", "primary_area": "", "supplementary_material": "", "author": "Noa Garcia;George Vogiatzis", "authorids": "garciadn@aston.ac.uk;g.vogiatzis@aston.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngarcia2018learning,\ntitle={Learning Non-Metric Visual Similarity for Image Retrieval},\nauthor={Noa Garcia and George Vogiatzis},\nyear={2018},\nurl={https://openreview.net/forum?id=Skvd-myR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Skvd-myR-", "pdf_size": 0, "rating": "3;4;7", "confidence": "5;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.2773500981126146, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1725821054915160783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "Skvin0GWM", "title": "Human-like Clustering with Deep Convolutional Neural Networks", "track": "main", "status": "Withdraw", "tldr": "Human-like Clustering with CNNs", "abstract": "Classification and clustering have been studied separately in machine learning and computer vision. 
Inspired by the recent success of deep learning models in solving various vision problems (e.g., object recognition, semantic segmentation) and the fact that humans serve as the gold standard in assessing clustering algorithms, here, we advocate for a unified treatment of the two problems and suggest that hierarchical frameworks that progressively build complex patterns on top of the simpler ones (e.g., convolutional neural networks) offer a promising solution. We do not dwell much on the learning mechanisms in these frameworks as they are still a matter of debate, with respect to biological constraints. Instead, we emphasize on the compositionality of the real world structures and objects. In particular, we show that CNNs, trained end to end using back propagation with noisy labels, are able to cluster data points belonging to several overlapping shapes, and do so much better than the state of the art algorithms. The main takeaway lesson from our study is that mechanisms of human vision, particularly the hierarchal organization of the visual ventral stream should be taken into account in clustering algorithms (e.g., for learning representations in an unsupervised manner or with minimum supervision) to reach human level clustering performance. This, by no means, suggests that other methods do not hold merits. For example, methods relying on pairwise affinities (e.g., spectral clustering) have been very successful in many cases but still fail in some cases (e.g., overlapping clusters).", "keywords": "Cluttering;deep learning;human learning", "primary_area": "", "supplementary_material": "", "author": "Ali Borji;Aysegul Dundar", "authorids": "aliborji@gmail.com;adundar@purdue.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skvin0GWM", "pdf_size": 0, "rating": "3;4", "confidence": "5;5", "rating_avg": 3.5, "confidence_avg": 5.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=748222308278447485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Temporal Difference Models: Model-Free Deep RL for Model-Based Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/108", "id": "Skw0n-W0Z", "author_site": "Vitchyr Pong, Shixiang Gu, Murtaza Dalal, Sergey Levine", "tldr": "We show that a special goal-condition value function trained with model free methods can be used within model-based control, resulting in substantially better sample efficiency and performance.", "abstract": "Model-free reinforcement learning (RL) has been proven to be a powerful, general tool for learning complex behaviors. However, its sample efficiency is often impractically large for solving challenging real-world problems, even for off-policy algorithms such as Q-learning. A limiting factor in classic model-free RL is that the learning signal consists only of scalar rewards, ignoring much of the rich information contained in state transition tuples. Model-based RL uses this information, by training a predictive model, but often does not achieve the same asymptotic performance as model-free RL due to model bias. 
We introduce temporal difference models (TDMs), a family of goal-conditioned value functions that can be trained with model-free learning and used for model-based control. TDMs combine the benefits of model-free and model-based RL: they leverage the rich information in state transitions to learn very efficiently, while still attaining asymptotic performance that exceeds that of direct model-based RL methods. Our experimental results show that, on a range of continuous control tasks, TDMs provide a substantial improvement in efficiency compared to state-of-the-art model-based and model-free methods.", "keywords": "model-based reinforcement learning;model-free reinforcement learning;temporal difference learning;predictive learning;predictive models;optimal control;off-policy reinforcement learning;deep learning;deep reinforcement learning;q learning", "primary_area": "", "supplementary_material": "", "author": "Vitchyr Pong*;Shixiang Gu*;Murtaza Dalal;Sergey Levine", "authorids": "vitchyr@berkeley.edu;sg717@cam.ac.uk;mdalal@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\npong*2018temporal,\ntitle={Temporal Difference Models: Model-Free Deep {RL} for Model-Based Control},\nauthor={Vitchyr Pong* and Shixiang Gu* and Murtaza Dalal and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Skw0n-W0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 325, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4445070816897830715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Skw0n-W0Z", "pdf": "https://openreview.net/pdf?id=Skw0n-W0Z", "email": ";;;", "author_num": 4 }, { "id": "SkwAEQbAb", "title": "A novel method to determine the number of latent dimensions with SVD", "track": "main", "status": "Reject", "tldr": "In this study, we introduce a novel method that relies on SVD to discover the number of latent dimensions.", "abstract": "Determining the number of latent dimensions is a ubiquitous problem in machine\nlearning. In this study, we introduce a novel method that relies on SVD to discover\nthe number of latent dimensions. The general principle behind the method is to\ncompare the curve of singular values of the SVD decomposition of a data set with\nthe randomized data set curve. The inferred number of latent dimensions corresponds\nto the crossing point of the two curves. 
To evaluate our methodology, we\ncompare it with competing methods such as Kaiser's eigenvalue-greater-than-one\nrule (K1), Parallel Analysis (PA), and Velicer's MAP test (Minimum Average Partial).\nWe also compare our method with the Silhouette Width (SW) technique, which is\nused in different clustering methods to determine the optimal number of clusters.\nThe results on synthetic data show that Parallel Analysis and our method have\nsimilar results and are more accurate than the other methods, and that our method gives\nslightly better results than Parallel Analysis for sparse data sets.", "keywords": "SVD;Latent Dimensions;Dimension Reductions;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Asana Neishabouri;Michel Desmarais", "authorids": "asana.neishabouri@polymtl.ca;michel.desmarais@polymtl.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nneishabouri2018a,\ntitle={A novel method to determine the number of latent dimensions with {SVD}},\nauthor={Asana Neishabouri and Michel Desmarais},\nyear={2018},\nurl={https://openreview.net/forum?id=SkwAEQbAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkwAEQbAb", "pdf_size": 0, "rating": "1;2;3", "confidence": "4;5;4", "rating_avg": 2.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:66y5WxHnlhcJ:scholar.google.com/&scioq=A+novel+method+to+determine+the+number+of+latent+dimensions+with+SVD&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Skx5txzb0W", "title": "A Boo(n) for Evaluating Architecture Performance", "track": "main", "status": "Reject", "tldr": "We point out important problems with the common practice of using the best single model performance for comparing deep learning architectures, and we propose a method that corrects these flaws.", "abstract": "We point out important problems with the common practice of using the best single model performance for comparing deep learning architectures, and we propose a method that corrects these flaws. Each time a model is trained, one gets a different result due to random factors in the training process, which include random parameter initialization and random data shuffling. Reporting the best single model performance does not appropriately address this stochasticity. 
We propose a normalized expected best-out-of-n performance (Boo_n) as a way to correct these problems.", "keywords": "evaluation;methodology", "primary_area": "", "supplementary_material": "", "author": "Ondrej Bajgar;Rudolf Kadlec;and Jan Kleindienst", "authorids": "ondrej@bajgar.org;rudolf_kadlec@cz.ibm.com;jankle@cz.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbajgar2018a,\ntitle={A Boon for Evaluating Architecture Performance},\nauthor={Ondrej Bajgar and Rudolf Kadlec and and Jan Kleindienst},\nyear={2018},\nurl={https://openreview.net/forum?id=Skx5txzb0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skx5txzb0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6114335001118425042&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SkxqZngC-", "title": "A Bayesian Nonparametric Topic Model with Variational Auto-Encoders", "track": "main", "status": "Reject", "tldr": "A Bayesian Nonparametric Topic Model with Variational Auto-Encoders which achieves the state-of-the-arts on public benchmarks in terms of perplexity, topic coherence and retrieval tasks.", "abstract": "Topic modeling of text documents is one of the most important tasks in representation learning. In this work, we propose iTM-VAE, which is a Bayesian nonparametric (BNP) topic model with variational auto-encoders. On one hand, as a BNP topic model, iTM-VAE potentially has infinite topics and can adapt the topic number to data automatically. On the other hand, different with the other BNP topic models, the inference of iTM-VAE is modeled by neural networks, which has rich representation capacity and can be computed in a simple feed-forward manner. Two variants of iTM-VAE are also proposed in this paper, where iTM-VAE-Prod models the generative process in products-of-experts fashion for better performance and iTM-VAE-G places a prior over the concentration parameter such that the model can adapt a suitable concentration parameter to data automatically. Experimental results on 20News and Reuters RCV1-V2 datasets show that the proposed models outperform the state-of-the-arts in terms of perplexity, topic coherence and document retrieval tasks. 
Moreover, the ability to adapt the concentration parameter to the data is also confirmed by experiments.", "keywords": "topic model;Bayesian nonparametric;variational auto-encoder;document modeling", "primary_area": "", "supplementary_material": "", "author": "Xuefei Ning;Yin Zheng;Zhuxi Jiang;Yu Wang;Huazhong Yang;Junzhou Huang", "authorids": "foxdoraame@gmail.com;yzheng3xg@gmail.com;zjiang9310@gmail.com;yu-wang@mail.tsinghua.edu.cn;yanghz@tsinghua.edu.cn;joehhuang@tencent.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nning2018a,\ntitle={A Bayesian Nonparametric Topic Model with Variational Auto-Encoders},\nauthor={Xuefei Ning and Yin Zheng and Zhuxi Jiang and Yu Wang and Huazhong Yang and Junzhou Huang},\nyear={2018},\nurl={https://openreview.net/forum?id=SkxqZngC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxqZngC-", "pdf_size": 0, "rating": "3;5;7", "confidence": "4;2;4", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12565003092399860909&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SkymMAxAb", "title": "AirNet: a machine learning dataset for air quality forecasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "In the past decade, many urban areas in China have suffered from serious air pollution problems, making air quality forecasting a hot research topic. Conventional approaches rely on numerical methods to estimate the pollutant concentration and require lots of computing power. To solve this problem, we applied widely used deep learning methods. Deep learning requires large-scale datasets to train an effective model. In this paper, we introduced a new dataset, entitled AirNet, containing a 0.25 degree resolution grid map of mainland China, with more than two years of continuous air quality measurements and meteorological data. We published this dataset as an open resource for machine learning research and set up a baseline for a 5-day air pollution forecast. 
The results of experiments demonstrated that this dataset could facilitate the development of new algorithms on the air quality forecast.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Songgang Zhao;Xingyuan Yuan;Da Xiao;Jianyuan Zhang;Zhouyuan Li", "authorids": "gfgkmn@gmail.com;yuan@caiyunapp.com;xiaoda99@gmail.com;littletree@caiyunapp.com;joeyzhouyuanli@caiyunapp.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhao2018airnet,\ntitle={AirNet: a machine learning dataset for air quality forecasting},\nauthor={Songgang Zhao and Xingyuan Yuan and Da Xiao and Jianyuan Zhang and Zhouyuan Li},\nyear={2018},\nurl={https://openreview.net/forum?id=SkymMAxAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkymMAxAb", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6909298917509499428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "A PAC-Bayesian Approach to Spectrally-Normalized Margin Bounds for Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/93", "id": "Skz_WfbCZ", "author_site": "Behnam Neyshabur, Srinadh Bhojanapalli, Nathan Srebro", "tldr": "", "abstract": "We present a generalization bound for feedforward neural networks in terms of the product of the spectral norm of the layers and the Frobenius norm of the weights. 
The generalization bound is derived using a PAC-Bayes analysis.", "keywords": "Neural Networks;Generalization;PAC-Bayes;Sharpness", "primary_area": "", "supplementary_material": "", "author": "Behnam Neyshabur;Srinadh Bhojanapalli;Nathan Srebro", "authorids": "bneyshabur@ttic.edu;srinadh@ttic.edu;nati@ttic.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nneyshabur2018a,\ntitle={A {PAC}-Bayesian Approach to Spectrally-Normalized Margin Bounds for Neural Networks},\nauthor={Behnam Neyshabur and Srinadh Bhojanapalli and Nathan Srebro},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Skz_WfbCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 736, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13808344181878972186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Skz_WfbCZ", "pdf": "https://openreview.net/pdf?id=Skz_WfbCZ", "email": ";;", "author_num": 3 }, { "title": "Neural Speed Reading via Skim-RNN", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/79", "id": "Sy-dQG-Rb", "author_site": "Minjoon Seo, Sewon Min, Ali Farhadi, Hannaneh Hajishirzi", "tldr": "", "abstract": "Inspired by the principles of speed reading, we introduce Skim-RNN, a recurrent neural network (RNN) that dynamically decides to update only a small fraction of the hidden state for relatively unimportant input tokens. Skim-RNN gives a significant computational advantage over an RNN that always updates the entire hidden state. Skim-RNN uses the same input and output interfaces as a standard RNN and can be easily used instead of RNNs in existing models. In our experiments, we show that Skim-RNN can achieve significantly reduced computational cost without losing accuracy compared to standard RNNs across five different natural language tasks. In addition, we demonstrate that the trade-off between accuracy and speed of Skim-RNN can be dynamically controlled during inference time in a stable manner. 
Our analysis also shows that Skim-RNN running on a single CPU offers lower latency compared to standard RNNs on GPUs.", "keywords": "Natural Language Processing;RNN;Inference Speed", "primary_area": "", "supplementary_material": "", "author": "Minjoon Seo;Sewon Min;Ali Farhadi;Hannaneh Hajishirzi", "authorids": "minjoon@cs.washington.edu;shmsw25@snu.ac.kr;ali@cs.washington.edu;hannaneh@washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nseo2018neural,\ntitle={Neural Speed Reading via Skim-{RNN}},\nauthor={Minjoon Seo and Sewon Min and Ali Farhadi and Hannaneh Hajishirzi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy-dQG-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8873138418966688907&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Sy-dQG-Rb", "pdf": "https://openreview.net/pdf?id=Sy-dQG-Rb", "email": ";;;", "author_num": 4 }, { "id": "Sy-tszZRZ", "title": "Bounding and Counting Linear Regions of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We empirically count the number of linear regions of rectifier networks and refine upper and lower bounds.", "abstract": "In this paper, we study the representational power of deep neural networks (DNN) that belong to the family of piecewise-linear (PWL) functions, based on PWL activation units such as rectifier or maxout. We investigate the complexity of such networks by studying the number of linear regions of the PWL function. Typically, a PWL function from a DNN can be seen as a large family of linear functions acting on millions of such regions. We directly build upon the work of Mont\u00fafar et al. (2014), Mont\u00fafar (2017), and Raghu et al. (2017) by refining the upper and lower bounds on the number of linear regions for rectified and maxout networks. In addition to achieving tighter bounds, we also develop a novel method to perform exact enumeration or counting of the number of linear regions with a mixed-integer linear formulation that maps the input space to output. We use this new capability to visualize how the number of linear regions changes while training DNNs. 
", "keywords": "rectifier networks;maxout networks;piecewise linear functions;linear regions;mixed-integer programming", "primary_area": "", "supplementary_material": "", "author": "Thiago Serra;Christian Tjandraatmadja;Srikumar Ramalingam", "authorids": "tserra@gmail.com;ctjandra@andrew.cmu.edu;srikumar.ramalingam@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nserra2018bounding,\ntitle={Bounding and Counting Linear Regions of Deep Neural Networks},\nauthor={Thiago Serra and Christian Tjandraatmadja and Srikumar Ramalingam},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy-tszZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy-tszZRZ", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 345, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13037973585758872519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Emergent Complexity via Multi-Agent Competition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/260", "id": "Sy0GnUxCb", "author_site": "Trapit Bansal, Jakub Pachocki, Szymon Sidor, Ilya Sutskever, Igor Mordatch", "tldr": "", "abstract": "Reinforcement learning algorithms can train agents that solve problems in complex, interesting environments. Normally, the complexity of the trained agent is closely related to the complexity of the environment. This suggests that a highly capable agent requires a complex environment for training. In this paper, we point out that a competitive multi-agent environment trained with self-play can produce behaviors that are far more complex than the environment itself. We also point out that such environments come with a natural curriculum, because for any skill level, an environment full of agents of this level will have the right level of difficulty.\nThis work introduces several competitive multi-agent environments where agents compete in a 3D world with simulated physics. The trained agents learn a wide variety of complex and interesting skills, even though the environment themselves are relatively simple. The skills include behaviors such as running, blocking, ducking, tackling, fooling opponents, kicking, and defending using both arms and legs. 
A highlight of the learned behaviors can be found here: https://goo.gl/eR7fbX", "keywords": "multi-agent systems;multi-agent competition;self-play;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Trapit Bansal;Jakub Pachocki;Szymon Sidor;Ilya Sutskever;Igor Mordatch", "authorids": "tbansal@cs.umass.edu;jakub@openai.com;szymon@openai.com;ilyasu@openai.com;mordatch@openai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbansal2018emergent,\ntitle={Emergent Complexity via Multi-Agent Competition},\nauthor={Trapit Bansal and Jakub Pachocki and Szymon Sidor and Ilya Sutskever and Igor Mordatch},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy0GnUxCb},\n}", "github": "[![github](/images/github_icon.svg) openai/multiagent-competition](https://github.com/openai/multiagent-competition) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Sy0GnUxCb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "3;7;9", "confidence": "3;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 524, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12865596457557919071&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Sy0GnUxCb", "pdf": "https://openreview.net/pdf?id=Sy0GnUxCb", "email": ";;;;", "author_num": 5 }, { "id": "Sy1f0e-R-", "title": "An empirical study on evaluation metrics of generative adversarial networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the widespread interest in generative adversarial networks (GANs), few works have studied the metrics that quantitatively evaluate GANs' performance. In this paper, we revisit several representative sample-based evaluation metrics for GANs, and address the important problem of \\emph{how to evaluate the evaluation metrics}. We start with a few necessary conditions for metrics to produce meaningful scores, such as distinguishing real from generated samples, identifying mode dropping and mode collapsing, and detecting overfitting. Then with a series of carefully designed experiments, we are able to comprehensively investigate existing sample-based metrics and identify their strengths and limitations in practical settings. Based on these results, we observe that kernel Maximum Mean Discrepancy (MMD) and the 1-Nearest-Neighbour (1-NN) two-sample test seem to satisfy most of the desirable properties, provided that the distances between samples are computed in a suitable feature space. 
Our experiments also unveil interesting properties about the behavior of several popular GAN models, such as whether they are memorizing training samples, and how far these state-of-the-art GANs are from perfect.", "keywords": "generative adversarial networks;evaluation metric", "primary_area": "", "supplementary_material": "", "author": "Gao Huang;Yang Yuan;Qiantong Xu;Chuan Guo;Yu Sun;Felix Wu;Kilian Weinberger", "authorids": "gh349@cornell.edu;yy528@cornell.edu;qx57@cornell.edu;cg563@cornell.edu;yusun@berkeley.edu;fw245@cornell.edu;kqw4@cornell.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nhuang2018an,\ntitle={An empirical study on evaluation metrics of generative adversarial networks},\nauthor={Gao Huang and Yang Yuan and Qiantong Xu and Chuan Guo and Yu Sun and Felix Wu and Kilian Weinberger},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy1f0e-R-},\n}", "github": "[![github](/images/github_icon.svg) xuqiantong/GAN-Metrics](https://github.com/xuqiantong/GAN-Metrics) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Sy1f0e-R-)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy1f0e-R-", "pdf_size": 0, "rating": "5;5;7;8", "confidence": "3;5;4;3", "rating_avg": 6.25, "confidence_avg": 3.75, "replies_avg": 18, "authors#_avg": 7, "corr_rating_confidence": -0.4061811972299616, "gs_citation": 395, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10986270796266305228&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Towards better understanding of gradient-based attribution methods for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/302", "id": "Sy21R9JAW", "author_site": "Marco Ancona, Enea Ceolini, Cengiz \u00d6ztireli, Markus Gross", "tldr": "Four existing backpropagation-based attribution methods are fundamentally similar. How to assess it?", "abstract": "Understanding the flow of information in Deep Neural Networks (DNNs) is a challenging problem that has gained increasing attention over the last few years. While several methods have been proposed to explain network predictions, there have been only a few attempts to compare them from a theoretical perspective. What is more, no exhaustive empirical comparison has been performed in the past. In this work we analyze four gradient-based attribution methods and formally prove conditions of equivalence and approximation between them. By reformulating two of these methods, we construct a unified framework which enables a direct comparison, as well as an easier implementation. 
Finally, we propose a novel evaluation metric, called Sensitivity-n, and test the gradient-based attribution methods alongside a simple perturbation-based attribution method on several datasets in the domains of image and text classification, using various network architectures.", "keywords": "Deep Neural Networks;Attribution methods;Theory of deep learning", "primary_area": "", "supplementary_material": "", "author": "Marco Ancona;Enea Ceolini;Cengiz \u00d6ztireli;Markus Gross", "authorids": "marco.ancona@inf.ethz.ch;enea.ceolini@ini.uzh.ch;cengizo@inf.ethz.ch;grossm@inf.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nancona2018towards,\ntitle={Towards better understanding of gradient-based attribution methods for Deep Neural Networks},\nauthor={Marco Ancona and Enea Ceolini and Cengiz \u00d6ztireli and Markus Gross},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy21R9JAW},\n}", "github": "[![github](/images/github_icon.svg) kundajelab/deeplift](https://github.com/kundajelab/deeplift) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Sy21R9JAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1347, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7129422820232184089&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "openreview": "https://openreview.net/forum?id=Sy21R9JAW", "pdf": "https://openreview.net/pdf?id=Sy21R9JAW", "email": ";;;", "author_num": 4 }, { "title": "Unsupervised Neural Machine Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/168", "id": "Sy2ogebAW", "author_site": "Mikel Artetxe, Gorka Labaka, Eneko Agirre, Kyunghyun Cho", "tldr": "We introduce the first successful method to train neural machine translation in an unsupervised manner, using nothing but monolingual corpora", "abstract": "In spite of the recent success of neural machine translation (NMT) in standard benchmarks, the lack of large parallel corpora poses a major practical problem for many language pairs. There have been several proposals to alleviate this issue with, for instance, triangulation and semi-supervised learning techniques, but they still require a strong cross-lingual signal. In this work, we completely remove the need for parallel data and propose a novel method to train an NMT system in a completely unsupervised manner, relying on nothing but monolingual corpora. Our model builds upon the recent work on unsupervised embedding mappings, and consists of a slightly modified attentional encoder-decoder model that can be trained on monolingual corpora alone using a combination of denoising and backtranslation. Despite the simplicity of the approach, our system obtains 15.56 and 10.21 BLEU points in WMT 2014 French-to-English and German-to-English translation. The model can also profit from small parallel corpora, and attains 21.81 and 15.24 points when combined with 100,000 parallel sentences, respectively. 
Our implementation is released as an open source project.", "keywords": "neural machine translation;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Mikel Artetxe;Gorka Labaka;Eneko Agirre;Kyunghyun Cho", "authorids": "mikel.artetxe@ehu.eus;gorka.labaka@ehu.eus;e.agirre@ehu.eus;kyunghyun.cho@nyu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nartetxe2018unsupervised,\ntitle={Unsupervised Neural Machine Translation},\nauthor={Mikel Artetxe and Gorka Labaka and Eneko Agirre and Kyunghyun Cho},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy2ogebAW},\n}", "github": "[![github](/images/github_icon.svg) rsennrich/subword-nmt](https://github.com/rsennrich/subword-nmt) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Sy2ogebAW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1070, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6109181985493123662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Sy2ogebAW", "pdf": "https://openreview.net/pdf?id=Sy2ogebAW", "email": ";;;", "author_num": 4 }, { "id": "Sy3XxCx0Z", "title": "Natural Language Inference with External Knowledge", "track": "main", "status": "Workshop", "tldr": "the proposed models with external knowledge further improve the state of the art on the SNLI dataset.", "abstract": "Modeling informal inference in natural language is very challenging. With the recent availability of large annotated data, it has become feasible to train complex models such as neural networks to perform natural language inference (NLI), which have achieved state-of-the-art performance. Although there exist relatively large annotated data, can machines learn all knowledge needed to perform NLI from the data? If not, how can NLI models benefit from external knowledge and how to build NLI models to leverage it? In this paper, we aim to answer these questions by enriching the state-of-the-art neural natural language inference models with external knowledge. We demonstrate that the proposed models with external knowledge further improve the state of the art on the Stanford Natural Language Inference (SNLI) dataset. 
", "keywords": "natural language inference;external knowledge;state of the art", "primary_area": "", "supplementary_material": "", "author": "Qian Chen;Xiaodan Zhu;Zhen-Hua Ling;Diana Inkpen", "authorids": "cq1231@mail.ustc.edu.cn;xiaodan.zhu@queensu.ca;zhling@ustc.edu.cn;diana.inkpen@uottawa.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2018natural,\ntitle={Natural Language Inference with External Knowledge},\nauthor={Qian Chen and Xiaodan Zhu and Zhen-Hua Ling and Diana Inkpen},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy3XxCx0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy3XxCx0Z", "pdf_size": 0, "rating": "3;5;6;7", "confidence": "5;4;5;4", "rating_avg": 5.25, "confidence_avg": 4.5, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.50709255283711, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16671344003810594601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Sy3fJXbA-", "title": "Connectivity Learning in Multi-Branch Networks", "track": "main", "status": "Reject", "tldr": "In this paper we introduced an algorithm to learn the connectivity of deep multi-branch networks. The approach is evaluated on image categorization where it consistently yields accuracy gains over state-of-the-art models that use fixed connectivity.", "abstract": "While much of the work in the design of convolutional networks over the last five years has revolved around the empirical investigation of the importance of depth, filter sizes, and number of feature channels, recent studies have shown that branching, i.e., splitting the computation along parallel but distinct threads and then aggregating their outputs, represents a new promising dimension for significant improvements in performance. To combat the complexity of design choices in multi-branch architectures, prior work has adopted simple strategies, such as a fixed branching factor, the same input being fed to all parallel branches, and an additive combination of the outputs produced by all branches at aggregation points. \n\nIn this work we remove these predefined choices and propose an algorithm to learn the connections between branches in the network. Instead of being chosen a priori by the human designer, the multi-branch connectivity is learned simultaneously with the weights of the network by optimizing a single loss function defined with respect to the end task. 
We demonstrate our approach on the problem of multi-class image classification using four different datasets where it yields consistently higher accuracy compared to the state-of-the-art ``ResNeXt'' multi-branch network given the same learning capacity.", "keywords": "connectivity learning;multi-branch networks;image categorization", "primary_area": "", "supplementary_material": "", "author": "Karim Ahmed;Lorenzo Torresani", "authorids": "karim.mmm@gmail.com;lt@dartmouth.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nahmed2018connectivity,\ntitle={Connectivity Learning in Multi-Branch Networks},\nauthor={Karim Ahmed and Lorenzo Torresani},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy3fJXbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Sy3fJXbA-", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;5", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13484831088292319177&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "Sy3nGCYXz", "title": "THE LOCAL DIMENSION OF DEEP MANIFOLD", "track": "main", "status": "Withdraw", "tldr": "We propose a SVD based method to explore the local dimension of activation manifold in deep neural networks.", "abstract": "Based on our observation that there exists a dramatic drop for the singular values of the fully connected layers or a single feature map of the convolutional layer, and that the dimension of the concatenated feature vector almost equals the summation of the dimension on each feature map, we propose a singular value decomposition (SVD) based approach to estimate the dimension of the deep manifolds for a typical convolutional neural network VGG19. We choose three categories from the ImageNet, namely Persian Cat, Container Ship and Volcano, and determine the local dimension of the deep manifolds of the deep layers through the tangent space of a target image. Through several augmentation methods, we found that the Gaussian noise method is closer to the intrinsic dimension, as by adding random noise to an image we are moving in an arbitrary dimension, and when the rank of the feature matrix of the augmented images does not increase we are very close\nto the local dimension of the manifold. We also estimate the dimension of the deep manifold based on the tangent space for each of the maxpooling layers. Our results show that the dimensions of different categories are close to each other and decline quickly along the convolutional layers and fully connected layers. Furthermore, we show that the dimensions decline quickly inside the Conv5 layer. Our work provides new insights for the intrinsic structure of deep neural networks and helps unveiling the inner organization of the black box of deep neural networks.", "keywords": "activation manifold;dimension;deep neural network;singular value decomposition", "primary_area": "", "supplementary_material": "", "author": "Mengxiao Zhang;Wangquan Wu;Yanren Zhang;Kun He;Tao Yu;Huan Long;John E. 
Hopcroft", "authorids": "zmx@hust.edu.cn;u201514497@hust.edu.cn;hhxjzyr@hust.edu.cn;brooklet60@hust.edu.cn;ydtydr@sjtu.edu.cn;longhuan@cs.sjtu.edu.cn;jeh@cs.cornell.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Sy3nGCYXz", "pdf_size": 0, "rating": "3;3;5", "confidence": "3;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 3, "authors#_avg": 7, "corr_rating_confidence": 0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6190236140213096317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Sy4c-3xRW", "title": "DropMax: Adaptive Stochastic Softmax", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We propose DropMax, a stochastic version of softmax classifier which at each iteration drops non-target classes with some probability, for each instance. Specifically, we overlay binary masking variables over class output probabilities, which are learned based on the input via regularized variational inference. This stochastic regularization has an effect of building an ensemble classifier out of combinatorial number of classifiers with different decision boundaries. Moreover, the learning of dropout probabilities for non-target classes on each instance allows the classifier to focus more on classification against the most confusing classes. We validate our model on multiple public datasets for classification, on which it obtains improved accuracy over regular softmax classifier and other baselines. Further analysis of the learned dropout masks shows that our model indeed selects confusing classes more often when it performs classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hae Beom Lee;Juho Lee;Eunho Yang;Sung Ju Hwang", "authorids": "hblee@unist.ac.kr;stonecold@postech.ac.kr;yangeh@gmail.com;sjhwang82@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbeom2018dropmax,\ntitle={DropMax: Adaptive Stochastic Softmax},\nauthor={Hae Beom Lee and Juho Lee and Eunho Yang and Sung Ju Hwang},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy4c-3xRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sy4c-3xRW", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4012034064776947134&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Sy5OAyZC-", "title": "On the Use of Word Embeddings Alone to Represent Natural Language Sequences", "track": "main", "status": "Reject", "tldr": "", "abstract": "To construct representations for natural language sequences, information from two main sources needs to be captured: (i) semantic meaning of individual words, and (ii) their compositionality. 
These two types of information are usually represented in the form of word embeddings and compositional functions, respectively. For the latter, Recurrent Neural Networks (RNNs) and Convolutional Neural Networks (CNNs) have been considered. There has not been a rigorous evaluation regarding the relative importance of each component to different text-representation-based tasks; i.e., how important is the modeling capacity of word embeddings alone, relative to the added value of a compositional function? In this paper, we conduct an extensive comparative study between Simple Word Embeddings-based Models (SWEMs), with no compositional parameters, relative to employing word embeddings within RNN/CNN-based models. Surprisingly, SWEMs exhibit comparable or even superior performance in the majority of cases considered. Moreover, in a new SWEM setup, we propose to employ a max-pooling operation over the learned word-embedding matrix of a given sentence. This approach is demonstrated to extract complementary features relative to the averaging operation standard to SWEMs, while endowing our model with better interpretability. To further validate our observations, we examine the information utilized by different models to make predictions, revealing interesting properties of word embeddings.\n", "keywords": "Natural Language Processing;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Dinghan Shen;Guoyin Wang;Wenlin Wang;Martin Renqiang Min;Qinliang Su;Yizhe Zhang;Ricardo Henao;Lawrence Carin", "authorids": "dinghan.shen@duke.edu;guoyin.wang@duke.edu;wenlin.wang@duke.edu;renqiang@nec-labs.com;qinliang.su@duke.edu;yizhe.zhang@duke.edu;ricardo.henao@duke.edu;lcarin@duke.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nshen2018on,\ntitle={On the Use of Word Embeddings Alone to Represent Natural Language Sequences},\nauthor={Dinghan Shen and Guoyin Wang and Wenlin Wang and Martin Renqiang Min and Qinliang Su and Yizhe Zhang and Ricardo Henao and Lawrence Carin},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy5OAyZC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=Sy5OAyZC-", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;5;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 18, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2327646326183009614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Latent Constraints: Learning to Generate Conditionally from Unconditional Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/62", "id": "Sy8XvGb0-", "author_site": "Jesse Engel, Matthew D Hoffman, Adam Roberts", "tldr": "A new approach to conditional generation by constraining the latent space of an unconditional generative model.", "abstract": "Deep generative neural networks have proven effective at both conditional and unconditional modeling of complex data distributions. Conditional generation enables interactive control, but creating new controls often requires expensive retraining. In this paper, we develop a method to condition generation without retraining the model. 
By post-hoc learning latent constraints, value functions that identify regions in latent space that generate outputs with desired attributes, we can conditionally sample from these regions with gradient-based optimization or amortized actor functions. Combining attribute constraints with a universal \u201crealism\u201d constraint, which enforces similarity to the data distribution, we generate realistic conditional images from an unconditional variational autoencoder. Further, using gradient-based optimization, we demonstrate identity-preserving transformations that make the minimal adjustment in latent space to modify the attributes of an image. Finally, with discrete sequences of musical notes, we demonstrate zero-shot conditional generation, learning latent constraints in the absence of labeled data or a differentiable reward function.", "keywords": "VAE;GAN;generative networks;conditional generation;latent-variable models", "primary_area": "", "supplementary_material": "", "author": "Jesse Engel;Matthew Hoffman;Adam Roberts", "authorids": "jesseengel@google.com;mhoffman@google.com;adarob@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nengel2018latent,\ntitle={Latent Constraints: Learning to Generate Conditionally from Unconditional Generative Models},\nauthor={Jesse Engel and Matthew Hoffman and Adam Roberts},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy8XvGb0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4801471707105268793&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Sy8XvGb0-", "pdf": "https://openreview.net/pdf?id=Sy8XvGb0-", "email": ";;", "author_num": 3 }, { "id": "SyAbZb-0Z", "title": "Transfer Learning to Learn with Multitask Neural Model Search", "track": "main", "status": "Reject", "tldr": "We present Multitask Neural Model Search, a Meta-learner that can design models for multiple tasks simultaneously and transfer learning to unseen tasks.", "abstract": "Deep learning models require extensive architecture design exploration and hyperparameter optimization to perform well on a given task. The exploration of the model design space is often made by a human expert, and optimized using a combination of grid search and search heuristics over a large space of possible choices. Neural Architecture Search (NAS) is a Reinforcement Learning approach that has been proposed to automate architecture design. NAS has been successfully applied to generate Neural Networks that rival the best human-designed architectures. However, NAS requires sampling, constructing, and training hundreds to thousands of models to achieve well-performing architectures. This procedure needs to be executed from scratch for each new task. The application of NAS to a wide set of tasks currently lacks a way to transfer generalizable knowledge across tasks.\nIn this paper, we present the Multitask Neural Model Search (MNMS) controller. 
Our goal is to learn a generalizable framework that can condition model construction on successful model searches for previously seen tasks, thus significantly speeding up the search for new tasks. We demonstrate that MNMS can conduct an automated architecture search for multiple tasks simultaneously while still learning well-performing, specialized models for each task. We then show that pre-trained MNMS controllers can transfer learning to new tasks. By leveraging knowledge from previous searches, we find that pre-trained MNMS models start from a better location in the search space and reduce search time on unseen tasks, while still discovering models that outperform published human-designed models.", "keywords": "Learning to Learn;Meta learning;Reinforcement learning;Transfer learning", "primary_area": "", "supplementary_material": "", "author": "Catherine Wong;Andrea Gesmundo", "authorids": "catwong@cs.stanford.edu;agesmundo@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwong2018transfer,\ntitle={Transfer Learning to Learn with Multitask Neural Model Search},\nauthor={Catherine Wong and Andrea Gesmundo},\nyear={2018},\nurl={https://openreview.net/forum?id=SyAbZb-0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyAbZb-0Z", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;2;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.3273268353539886, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12568256917610784202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SyBBgXWAZ", "title": "Optimal transport maps for distribution preserving operations on latent spaces of Generative Models", "track": "main", "status": "Reject", "tldr": "Operations in the GAN latent space can induce a distribution mismatch compared to the training distribution, and we address this using optimal transport to match the distributions. ", "abstract": "Generative models such as Variational Auto Encoders (VAEs) and Generative Adversarial Networks (GANs) are typically trained for a fixed prior distribution in the latent space, such as uniform or Gaussian.\nAfter a trained model is obtained, one can sample the Generator in various forms for exploration and understanding, such as interpolating between two samples, sampling in the vicinity of a sample or exploring differences between a pair of samples applied to a third sample.\nIn this paper, we show that the latent space operations used in the literature so far induce a distribution mismatch between the resulting outputs and the prior distribution the model was trained on. To address this, we propose to use distribution matching transport maps to ensure that such latent space operations preserve the prior distribution, while minimally modifying the original operation. 
\nOur experimental results validate that the proposed operations give higher quality samples compared to the original operations.", "keywords": "Generative Models;GANs;latent space operations;optimal transport", "primary_area": "", "supplementary_material": "", "author": "Eirikur Agustsson;Alexander Sage;Radu Timofte;Luc Van Gool", "authorids": "aeirikur@vision.ee.ethz.ch;sagea@student.ethz.ch;radu.timofte@vision.ee.ethz.ch;vangool@vision.ee.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nagustsson2018optimal,\ntitle={Optimal transport maps for distribution preserving operations on latent spaces of Generative Models},\nauthor={Eirikur Agustsson and Alexander Sage and Radu Timofte and Luc Van Gool},\nyear={2018},\nurl={https://openreview.net/forum?id=SyBBgXWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyBBgXWAZ", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11631258824141665090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Implicit Causal Models for Genome-wide Association Studies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/274", "id": "SyELrEeAb", "author_site": "Dustin Tran, David Blei", "tldr": "Implicit models applied to causality and genetics", "abstract": "Progress in probabilistic generative models has accelerated, developing richer models with neural architectures, implicit densities, and with scalable algorithms for their Bayesian inference. However, there has been limited progress in models that capture causal relationships, for example, how individual genetic factors cause major human diseases. In this work, we focus on two challenges in particular: How do we build richer causal models, which can capture highly nonlinear relationships and interactions between multiple causes? How do we adjust for latent confounders, which are variables influencing both cause and effect and which prevent learning of causal relationships? To address these challenges, we synthesize ideas from causality and modern probabilistic modeling. For the first, we describe implicit causal models, a class of causal models that leverages neural architectures with an implicit density. For the second, we describe an implicit causal model that adjusts for confounders by sharing strength across examples. In experiments, we scale Bayesian inference on up to a billion genetic measurements. We achieve state of the art accuracy for identifying causal factors: we significantly outperform the second best result by an absolute difference of 15-45.3%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dustin Tran;David M. Blei", "authorids": "dustin@cs.columbia.edu;david.blei@columbia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntran2018implicit,\ntitle={Implicit Causal Models for Genome-wide Association Studies},\nauthor={Dustin Tran and David M. 
Blei},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyELrEeAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 5.0, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16620947648071858910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SyELrEeAb", "pdf": "https://openreview.net/pdf?id=SyELrEeAb", "email": ";", "author_num": 2 }, { "id": "SyF7Erp6W", "title": "Learning to play slot cars and Atari 2600 games in just minutes", "track": "main", "status": "Reject", "tldr": "Continental-philosophy-inspired approach to learn with few data.", "abstract": "Machine learning algorithms for controlling devices will need to learn quickly, with few trials. Such a goal can be attained with concepts borrowed from continental philosophy and formalized using tools from the mathematical theory of categories. Illustrations of this approach are presented on a cyberphysical system: the slot car game, and also on Atari 2600 games.", "keywords": "Artificial Intelligence;Signal processing;Philosophy;Analogy;ALE;Slot Car", "primary_area": "", "supplementary_material": "", "author": "Lionel Cordesses;Omar Bentahar;Julien Page", "authorids": "lionel.cordesses@renault.com;omar.bentahar@renault.com;ju.page@hotmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncordesses2018learning,\ntitle={Learning to play slot cars and Atari 2600 games in just minutes},\nauthor={Lionel Cordesses and Omar Bentahar and Julien Page},\nyear={2018},\nurl={https://openreview.net/forum?id=SyF7Erp6W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyF7Erp6W", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;2;1", "rating_avg": 2.6666666666666665, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.9707253433941506, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SyGT_6yCZ", "title": "Simple Fast Convolutional Feature Learning", "track": "main", "status": "Reject", "tldr": "A simple fast method for extracting visual features from convolutional neural networks", "abstract": "The quality of the features used in visual recognition is of fundamental importance for the overall system. For a long time, low-level hand-designed feature algorithms such as SIFT and HOG have obtained the best results on image recognition. Visual features have recently been extracted from trained convolutional neural networks. Despite the high-quality results, one of the main drawbacks of this approach, when compared with hand-designed features, is the training time required during the learning process. In this paper, we propose a simple and fast way to train supervised convolutional models for feature extraction while still maintaining their high quality. 
This methodology is evaluated on different datasets and compared with state-of-the-art approaches.", "keywords": "Feature Learning;Convolutional Neural Networks;Visual Recognition", "primary_area": "", "supplementary_material": "", "author": "David Mac\u00eado;Cleber Zanchettin;Teresa Ludermir", "authorids": "dlm@cin.ufpe.br;cz@cin.ufpe.br;tbl@cin.ufpe.br", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmac\u00eado2018simple,\ntitle={Simple Fast Convolutional Feature Learning},\nauthor={David Mac\u00eado and Cleber Zanchettin and Teresa Ludermir},\nyear={2018},\nurl={https://openreview.net/forum?id=SyGT_6yCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyGT_6yCZ", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6241372502100537763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Countering Adversarial Images using Input Transformations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/139", "id": "SyJ7ClWCb", "author_site": "Chuan Guo, Mayank Rana, Moustapha Cisse, Laurens van der Maaten", "tldr": "We apply a model-agnostic defense strategy against adversarial examples and achieve 60% white-box accuracy and 90% black-box accuracy against major attack algorithms.", "abstract": "This paper investigates strategies that defend against adversarial-example attacks on image-classification systems by transforming the inputs before feeding them to the system. Specifically, we study applying image transformations such as bit-depth reduction, JPEG compression, total variance minimization, and image quilting before feeding the image to a convolutional network classifier. Our experiments on ImageNet show that total variance minimization and image quilting are very effective defenses in practice, in particular, when the network is trained on transformed images. The strength of those defenses lies in their non-differentiable nature and their inherent randomness, which makes it difficult for an adversary to circumvent the defenses. 
Our best defense eliminates 60% of strong gray-box and 90% of strong black-box attacks by a variety of major attack methods.", "keywords": "adversarial example;machine learning security;computer vision;image classification", "primary_area": "", "supplementary_material": "", "author": "Chuan Guo;Mayank Rana;Moustapha Cisse;Laurens van der Maaten", "authorids": "cg563@cornell.edu;mayankrana@fb.com;moustaphacisse@fb.com;lvdmaaten@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nguo2018countering,\ntitle={Countering Adversarial Images using Input Transformations},\nauthor={Chuan Guo and Mayank Rana and Moustapha Cisse and Laurens van der Maaten},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyJ7ClWCb},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/adversarial_image_defenses](https://github.com/facebookresearch/adversarial_image_defenses)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;8", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 32, "authors#_avg": 4, "corr_rating_confidence": 0.6933752452815364, "gs_citation": 1821, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3375700876994648267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyJ7ClWCb", "pdf": "https://openreview.net/pdf?id=SyJ7ClWCb", "email": ";;;", "author_num": 4 }, { "title": "Multi-level Residual Networks from Dynamical Systems View", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/253", "id": "SyJS-OgR-", "author_site": "Bo Chang, Lili Meng, Eldad Haber, Frederick Tung, David Begert", "tldr": "", "abstract": "Deep residual networks (ResNets) and their variants are widely used in many computer vision applications and natural language processing tasks. However, the theoretical principles for designing and training ResNets are still not fully understood. Recently, several points of view have emerged to try to interpret ResNet theoretically, such as unraveled view, unrolled iterative estimation and dynamical systems view. In this paper, we adopt the dynamical systems point of view, and analyze the lesioning properties of ResNet both theoretically and experimentally. Based on these analyses, we additionally propose a novel method for accelerating ResNet training. 
We apply the proposed method to train ResNets and Wide ResNets for three image classification benchmarks, reducing training time by more than 40\\% with superior or on-par accuracy.", "keywords": "residual networks;dynamical systems", "primary_area": "", "supplementary_material": "", "author": "Bo Chang;Lili Meng;Eldad Haber;Frederick Tung;David Begert", "authorids": "bchang@stat.ubc.ca;lilimeng1103@gmail.com;haber@math.ubc.ca;ftung@sfu.ca;david@xtract.ai", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nchang2018multilevel,\ntitle={Multi-level Residual Networks from Dynamical Systems View},\nauthor={Bo Chang and Lili Meng and Eldad Haber and Frederick Tung and David Begert},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyJS-OgR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13446938182344461290&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyJS-OgR-", "pdf": "https://openreview.net/pdf?id=SyJS-OgR-", "email": ";;;;", "author_num": 5 }, { "id": "SyKoKWbC-", "title": "Distributional Adversarial Networks", "track": "main", "status": "Workshop", "tldr": "We show that the mode collapse problem in GANs may be explained by a lack of information sharing between observations in a training batch, and propose a distribution-based framework for globally sharing information between gradients that leads to more stable and effective adversarial training.", "abstract": "In most current formulations of adversarial training, the discriminators can be expressed as single-input operators, that is, the mapping they define is separable over observations. In this work, we argue that this property might help explain the infamous mode collapse phenomenon in adversarially-trained generative models. Inspired by discrepancy measures and two-sample tests between probability distributions, we propose distributional adversaries that operate on samples, i.e., on sets of multiple points drawn from a distribution, rather than on single observations. We show how they can be easily implemented on top of existing models. Various experimental results show that generators trained in combination with our distributional adversaries are much more stable and are remarkably less prone to mode collapse than traditional models trained with observation-wise prediction discriminators. 
In addition, the application of our framework to domain adaptation results in strong improvement over recent state-of-the-art.", "keywords": "adversarial learning;generative model;domain adaptation;two-sample test", "primary_area": "", "supplementary_material": "", "author": "Chengtao Li;David Alvarez-Melis;Keyulu Xu;Stefanie Jegelka;Suvrit Sra", "authorids": "ctli@mit.edu;dalvmel@mit.edu;keyulu@mit.edu;stefje@csail.mit.edu;suvrit@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2018distributional,\ntitle={Distributional Adversarial Networks},\nauthor={Chengtao Li and David Alvarez-Melis and Keyulu Xu and Stefanie Jegelka and Suvrit Sra},\nyear={2018},\nurl={https://openreview.net/forum?id=SyKoKWbC-},\n}", "github": "[![github](/images/github_icon.svg) ChengtaoLi/dan](https://github.com/ChengtaoLi/dan)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyKoKWbC-", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4318799851448472619&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SyL9u-WA-", "title": "Stabilizing Gradients for Deep Neural Networks via Efficient SVD Parameterization", "track": "main", "status": "Reject", "tldr": "To solve the gradient vanishing/exploding problems, we proprose an efficient parametrization of the transition matrix of RNN that loses no expressive power, converges faster and has good generalization.", "abstract": "Vanishing and exploding gradients are two of the main obstacles in training deep neural networks, especially in capturing long range dependencies in recurrent neural networks (RNNs). In this paper, we present an efficient parametrization of the transition matrix of an RNN that allows us to stabilize the gradients that arise in its training. Specifically, we parameterize the transition matrix by its singular value decomposition (SVD), which allows us to explicitly track and control its singular values. We attain efficiency by using tools that are common in numerical linear algebra, namely Householder reflectors for representing the orthogonal matrices that arise in the SVD. By explicitly controlling the singular values, our proposed svdRNN method allows us to easily solve the exploding gradient problem and we observe that it empirically solves the vanishing gradient issue to a large extent. We note that the SVD parameterization can be used for any rectangular weight matrix, hence it can be easily extended to any deep neural network, such as a multi-layer perceptron. Theoretically, we demonstrate that our parameterization does not lose any expressive power, and show how it potentially makes the optimization process easier. Our extensive experimental results also demonstrate that the proposed framework converges faster, and has good generalization, especially when the depth is large. \n", "keywords": "Recurrent Neural Network;Vanishing Gradient;Exploding Gradient;Linear Algebra;Householder Reflections", "primary_area": "", "supplementary_material": "", "author": "Jiong Zhang;Qi Lei;Inderjit S. 
Dhillon", "authorids": "zhangjiong724@utexas.edu;leiqi@ices.utexas.edu;inderjit@cs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2018stabilizing,\ntitle={Stabilizing Gradients for Deep Neural Networks via Efficient {SVD} Parameterization},\nauthor={Jiong Zhang and Qi Lei and Inderjit S. Dhillon},\nyear={2018},\nurl={https://openreview.net/forum?id=SyL9u-WA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyL9u-WA-", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10623363336533108811&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9 }, { "title": "Decoupling the Layers in Residual Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/332", "id": "SyMvJrdaW", "author_site": "Ricky Fok, Aijun An, Zana Rashidi, Xiaogang Wang", "tldr": "We propose the Warped Residual Network using a parallelizable warp operator for forward and backward propagation to distant layers that trains faster than the original residual neural network. ", "abstract": "We propose a Warped Residual Network (WarpNet) using a parallelizable warp operator for forward and backward propagation to distant layers that trains faster than the original residual neural network. We apply a perturbation theory on residual networks and decouple the interactions between residual units. The resulting warp operator is a first order approximation of the output over multiple layers. The first order perturbation theory exhibits properties such as binomial path lengths and exponential gradient scaling found experimentally by Veit et al (2016). \nWe demonstrate through an extensive performance study that the proposed network achieves comparable predictive performance to the original residual network with the same number of parameters, while achieving a significant speed-up on the total training time. 
As WarpNet performs model parallelism in residual network training in which weights are distributed over different GPUs, it offers speed-up and capability to train larger networks compared to original residual networks.", "keywords": "Warped residual networks;residual networks", "primary_area": "", "supplementary_material": "", "author": "Ricky Fok;Aijun An;Zana Rashidi;Xiaogang Wang", "authorids": "ricky.fok3@gmail.com;aan@cse.yorku.ca;rashidi.zana@gmail.com;stevenw@mathstat.yorku.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nfok2018decoupling,\ntitle={Decoupling the Layers in Residual Networks},\nauthor={Ricky Fok and Aijun An and Zana Rashidi and Xiaogang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyMvJrdaW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 26, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14086525505046437085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyMvJrdaW", "pdf": "https://openreview.net/pdf?id=SyMvJrdaW", "email": ";;;", "author_num": 4 }, { "title": "Adaptive Quantization of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/271", "id": "SyOK1Sg0W", "author_site": "Soroosh Khoram, Jing Li", "tldr": "An adaptive method for fixed-point quantization of neural networks based on theoretical analysis rather than heuristics. ", "abstract": "Despite the state-of-the-art accuracy of Deep Neural Networks (DNN) in various classification problems, their deployment onto resource constrained edge computing devices remains challenging due to their large size and complexity. Several recent studies have reported remarkable results in reducing this complexity through quantization of DNN models. However, these studies usually do not consider the changes in the loss function when performing quantization, nor do they take the different importances of DNN model parameters to the accuracy into account. We address these issues in this paper by proposing a new method, called adaptive quantization, which simplifies a trained DNN model by finding a unique, optimal precision for each network parameter such that the increase in loss is minimized. The optimization problem at the core of this method iteratively uses the loss function gradient to determine an error margin for each parameter and assigns it a precision accordingly. Since this problem uses linear functions, it is computationally cheap and, as we will show, has a closed-form approximate solution. Experiments on MNIST, CIFAR, and SVHN datasets showed that the proposed method can achieve near or better than state-of-the-art reduction in model size with similar error rates. 
Furthermore, it can achieve compressions close to floating-point model compression methods without loss of accuracy.", "keywords": "Deep Neural Networks;Model Quantization;Model Compression", "primary_area": "", "supplementary_material": "", "author": "Soroosh Khoram;Jing Li", "authorids": "khoram@wisc.edu;jli@ece.wisc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkhoram2018adaptive,\ntitle={Adaptive Quantization of Neural Networks},\nauthor={Soroosh Khoram and Jing Li},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyOK1Sg0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10539773684960766210&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyOK1Sg0W", "pdf": "https://openreview.net/pdf?id=SyOK1Sg0W", "email": ";", "author_num": 2 }, { "id": "SyPMT6gAb", "title": "Variance Regularized Counterfactual Risk Minimization via Variational Divergence Minimization", "track": "main", "status": "Reject", "tldr": "For off-policy learning with bandit feedbacks, we propose a new variance regularized counterfactual learning algorithm, which has both theoretical foundations and superior empirical performance.", "abstract": "Off-policy learning, the task of evaluating and improving policies using historic data collected from a logging policy, is important because on-policy evaluation is usually expensive and has adverse impacts. One of the major challenge of off-policy learning is to derive counterfactual estimators that also has low variance and thus low generalization error. \nIn this work, inspired by learning bounds for importance sampling problems, we present a new counterfactual learning principle for off-policy learning with bandit feedbacks.Our method regularizes the generalization error by minimizing the distribution divergence between the logging policy and the new policy, and removes the need for iterating through all training samples to compute sample variance regularization in prior work. 
With neural network policies, our end-to-end training algorithms using variational divergence minimization showed significant improvement over conventional baseline algorithms and is also consistent with our theoretical results.", "keywords": "Counterfactual Inference;Off-Policy Learning;Variance Regularization", "primary_area": "", "supplementary_material": "", "author": "Hang Wu;Hang Wu", "authorids": "hwu340@gatech.edu;hangwu@gatech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwu2018variance,\ntitle={Variance Regularized Counterfactual Risk Minimization via Variational Divergence Minimization},\nauthor={Hang Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=SyPMT6gAb},\n}", "github": "[![github](/images/github_icon.svg) hang-wu/VRCRM](https://github.com/hang-wu/VRCRM)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyPMT6gAb", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.6546536707079772, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16906275230514657049&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "The power of deeper networks for expressing natural functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/39", "id": "SyProzZAW", "author_site": "David Rolnick, Max Tegmark", "tldr": "We prove that deep neural networks are exponentially more efficient than shallow ones at approximating sparse multivariate polynomials.", "abstract": "It is well-known that neural networks are universal approximators, but that deeper networks tend in practice to be more powerful than shallower ones. We shed light on this by proving that the total number of neurons m required to approximate natural classes of multivariate polynomials of n variables grows only linearly with n for deep neural networks, but grows exponentially when merely a single hidden layer is allowed. 
We also provide evidence that when the number of hidden layers is increased from 1 to k, the neuron requirement grows exponentially not with n but with n^{1/k}, suggesting that the minimum number of layers required for practical expressibility grows only logarithmically with n.", "keywords": "expressivity of neural networks;depth of neural networks;universal approximators;function approximation;deep learning", "primary_area": "", "supplementary_material": "", "author": "David Rolnick;Max Tegmark", "authorids": "drolnick@mit.edu;tegmark@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nrolnick2018the,\ntitle={The power of deeper networks for expressing natural functions},\nauthor={David Rolnick and Max Tegmark},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyProzZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 242, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11248399658974838640&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyProzZAW", "pdf": "https://openreview.net/pdf?id=SyProzZAW", "email": ";", "author_num": 2 }, { "id": "SySaJ0xCZ", "title": "Simple and efficient architecture search for Convolutional Neural Networks", "track": "main", "status": "Workshop", "tldr": "We propose a simple and efficent method for architecture search for convolutional neural networks.", "abstract": "Neural networks have recently had a lot of success for many tasks. However, neural\nnetwork architectures that perform well are still typically designed manually\nby experts in a cumbersome trial-and-error process. We propose a new method\nto automatically search for well-performing CNN architectures based on a simple\nhill climbing procedure whose operators apply network morphisms, followed\nby short optimization runs by cosine annealing. Surprisingly, this simple method\nyields competitive results, despite only requiring resources in the same order of\nmagnitude as training a single network. 
E.g., on CIFAR-10, our method designs\nand trains networks with an error rate below 6% in only 12 hours on a single GPU;\ntraining for one day reduces this error further, to almost 5%.", "keywords": "Deep Learning;Hyperparameter Optimization;Architecture Search;Convolutional Neural Networks;Network Morphism;Network Transformation;SGDR;Cosine annealing;hill climbing", "primary_area": "", "supplementary_material": "", "author": "Thomas Elsken;Jan Hendrik Metzen;Frank Hutter", "authorids": "thomas.elsken@de.bosch.com;janhendrik.metzen@de.bosch.com;fh@cs.uni-freiburg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nelsken2018simple,\ntitle={Simple and efficient architecture search for Convolutional Neural Networks},\nauthor={Thomas Elsken and Jan Hendrik Metzen and Frank Hutter},\nyear={2018},\nurl={https://openreview.net/forum?id=SySaJ0xCZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SySaJ0xCZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SySaJ0xCZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 323, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10705593020526188892&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SySisz-CW", "title": "On the difference between building and extracting patterns: a causal analysis of deep generative models.", "track": "main", "status": "Reject", "tldr": "We use causal inference to characterise the architecture of generative models", "abstract": "Generative models are important tools to capture and investigate the properties of complex empirical data. Recent developments such as Generative Adversarial Networks (GANs) and Variational Auto-Encoders (VAEs) use two very similar, but \\textit{reverse}, deep convolutional architectures, one to generate and one to extract information from data. Does learning the parameters of both architectures obey the same rules? We exploit the causality principle of independence of mechanisms to quantify how the weights of successive layers adapt to each other. Using the recently introduced Spectral Independence Criterion, we quantify the dependencies between the kernels of successive convolutional layers and show that those are more independent for the generative process than for information extraction, in line with results from the field of causal inference. 
In addition, our experiments on generation of human faces suggest that more independence between successive layers of generators results in improved performance of these architectures.\n", "keywords": "GAN;VAE;causality", "primary_area": "", "supplementary_material": "", "author": "Michel Besserve;Dominik Janzing;Bernhard Schoelkopf", "authorids": "michel.besserve@tuebingen.mpg.de;dominik.janzing@tuebingen.mpg.de;bs@tuebingen.mpg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbesserve2018on,\ntitle={On the difference between building and extracting patterns: a causal analysis of deep generative models.},\nauthor={Michel Besserve and Dominik Janzing and Bernhard Schoelkopf},\nyear={2018},\nurl={https://openreview.net/forum?id=SySisz-CW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SySisz-CW", "pdf_size": 0, "rating": "2;7;7", "confidence": "4;2;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3VUT-1LXIRUJ:scholar.google.com/&scioq=On+the+difference+between+building+and+extracting+patterns:+a+causal+analysis+of+deep+generative+models.&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SySpa-Z0Z", "title": "From Information Bottleneck To Activation Norm Penalty", "track": "main", "status": "Reject", "tldr": "We derive a norm penalty on the output of the neural network from the information bottleneck perspective", "abstract": "Many regularization methods have been proposed to prevent overfitting in neural networks. Recently, a regularization method has been proposed to optimize the variational lower bound of the Information Bottleneck Lagrangian. However, this method cannot be generalized to regular neural network architectures. We present the activation norm penalty that is derived from the information bottleneck principle and is theoretically grounded in a variation dropout framework. Unlike in previous literature, it can be applied to any general neural network. We demonstrate that this penalty can give consistent improvements to different state of the art architectures both in language modeling and image classification. 
We present analyses on the properties of this penalty and compare it to other methods that also reduce mutual information.", "keywords": "Deep Learning;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Allen Nie;Mihir Mongia;James Zou", "authorids": "anie@stanford.edu;mihir.mongia@mssm.edu;jamesz@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnie2018from,\ntitle={From Information Bottleneck To Activation Norm Penalty},\nauthor={Allen Nie and Mihir Mongia and James Zou},\nyear={2018},\nurl={https://openreview.net/forum?id=SySpa-Z0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SySpa-Z0Z", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;4;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:X1duK9Hqv-cJ:scholar.google.com/&scioq=From+Information+Bottleneck+To+Activation+Norm+Penalty&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SyUkxxZ0b", "title": "Adversarial Spheres", "track": "main", "status": "Workshop", "tldr": "We hypothesize that the vulnerability of image models to small adversarial perturbation is a naturally occurring result of the high dimensional geometry of the data manifold. We explore and theoretically prove this hypothesis for a simple synthetic dataset.", "abstract": " State of the art computer vision models have been shown to be vulnerable to small adversarial perturbations of the input. In other words, most images in the data distribution are both correctly classified by the model and are very close to a visually similar misclassified image. Despite substantial research interest, the cause of the phenomenon is still poorly understood and remains unsolved. We hypothesize that this counter intuitive behavior is a naturally occurring result of the high dimensional geometry of the data manifold. As a first step towards exploring this hypothesis, we study a simple synthetic dataset of classifying between two concentric high dimensional spheres. For this dataset we show a fundamental tradeoff between the amount of test error and the average distance to nearest error. In particular, we prove that any model which misclassifies a small constant fraction of a sphere will be vulnerable to adversarial perturbations of size $O(1/\\sqrt{d})$. Surprisingly, when we train several different architectures on this dataset, all of their error sets naturally approach this theoretical bound. As a result of the theory, the vulnerability of neural networks to small adversarial perturbations is a logical consequence of the amount of test error observed. 
We hope that our theoretical analysis of this very simple case will point the way forward to explore how the geometry of complex real-world data sets leads to adversarial examples.", "keywords": "Adversarial Examples;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Justin Gilmer;Luke Metz;Fartash Faghri;Sam Schoenholz;Maithra Raghu;Martin Wattenberg;Ian Goodfellow", "authorids": ";;fartash.faghri@google.com;;maithra@google.com;;goodfellow@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ngilmer2018adversarial,\ntitle={Adversarial Spheres},\nauthor={Justin Gilmer and Luke Metz and Fartash Faghri and Sam Schoenholz and Maithra Raghu and Martin Wattenberg and Ian Goodfellow},\nyear={2018},\nurl={https://openreview.net/forum?id=SyUkxxZ0b},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SyUkxxZ0b)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyUkxxZ0b", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;3", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 436, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16463129449367579533&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SyVOjfbRb", "title": "LSH-SAMPLING BREAKS THE COMPUTATIONAL CHICKEN-AND-EGG LOOP IN ADAPTIVE STOCHASTIC GRADIENT ESTIMATION", "track": "main", "status": "Workshop", "tldr": "We improve the running of all existing gradient descent algorithms.", "abstract": "Stochastic Gradient Descent or SGD is the most popular optimization algorithm for large-scale problems. SGD estimates the gradient by uniform sampling with sample size one. There have been several other works that suggest faster epoch wise convergence by using weighted non-uniform sampling for better gradient estimates. Unfortunately, the per-iteration cost of maintaining this adaptive distribution for gradient estimation is more than calculating the full gradient. As a result, the false impression of faster convergence in iterations leads to slower convergence in time, which we call a chicken-and-egg loop. In this paper, we break this barrier by providing the first demonstration of a sampling scheme, which leads to superior gradient estimation, while keeping the sampling cost per iteration similar to that of the uniform sampling. Such an algorithm is possible due to the sampling view of Locality Sensitive Hashing (LSH), which came to light recently. As a consequence of superior and fast estimation, we reduce the running time of all existing gradient descent algorithms. 
We demonstrate the benefits of our proposal on both SGD and AdaGrad.", "keywords": "Stochastic Gradient Descent;Optimization;Sampling;Estimation", "primary_area": "", "supplementary_material": "", "author": "Beidi Chen;Yingchen Xu;Anshumali Shrivastava", "authorids": "beidi.chen@rice.edu;yingchen.xu@rice.edu;anshumali@rice.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2018lshsampling,\ntitle={{LSH}-{SAMPLING} {BREAKS} {THE} {COMPUTATIONAL} {CHICKEN}-{AND}-{EGG} {LOOP} {IN} {ADAPTIVE} {STOCHASTIC} {GRADIENT} {ESTIMATION}},\nauthor={Beidi Chen and Yingchen Xu and Anshumali Shrivastava},\nyear={2018},\nurl={https://openreview.net/forum?id=SyVOjfbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyVOjfbRb", "pdf_size": 0, "rating": "4;4;8", "confidence": "5;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12992592929726991567&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SyVVXngRW", "title": "Deep Asymmetric Multi-task Feature Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose Deep Asymmetric Multitask Feature Learning (Deep-AMTFL) which can learn deep representations shared across multiple tasks while effectively preventing negative transfer that may happen in the feature sharing process. Specifically, we introduce an asymmetric autoencoder term that allows reliable predictors for the easy tasks to have high contribution to the feature learning while suppressing the influences of unreliable predictors for more difficult tasks. This allows the learning of less noisy representations, and enables unreliable predictors to exploit knowledge from the reliable predictors via the shared latent features. Such asymmetric knowledge transfer through shared features is also more scalable and efficient than inter-task asymmetric transfer. 
We validate our Deep-AMTFL model on multiple benchmark datasets for multitask learning and image classification, on which it significantly outperforms existing symmetric and asymmetric multitask learning models, by effectively preventing negative transfer in deep feature learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hae Beom Lee;Eunho Yang;Sung Ju Hwang", "authorids": "hblee@unist.ac.kr;yangeh@gmail.com;sjhwang@unist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbeom2018deep,\ntitle={Deep Asymmetric Multi-task Feature Learning},\nauthor={Hae Beom Lee and Eunho Yang and Sung Ju Hwang},\nyear={2018},\nurl={https://openreview.net/forum?id=SyVVXngRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyVVXngRW", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7786406609741351183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SyW4Gjg0W", "title": "Kernel Graph Convolutional Neural Nets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph kernels have been successfully applied to many graph classification problems. Typically, a kernel is first designed, and then an SVM classifier is trained based on the features defined implicitly by this kernel. This two-stage approach decouples data representation from learning, which is suboptimal. On the other hand, Convolutional Neural Networks (CNNs) have the capability to learn their own features directly from the raw data during training. Unfortunately, they cannot handle irregular data such as graphs. We address this challenge by using graph kernels to embed meaningful local neighborhoods of the graphs in a continuous vector space. A set of filters is then convolved with these patches, pooled, and the output is then passed to a feedforward network. With limited parameter tuning, our approach outperforms strong baselines on 7 out of 10 benchmark datasets, and reaches comparable performance elsewhere. 
Code and data are publicly available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Giannis Nikolentzos;Polykarpos Meladianos;Antoine J-P Tixier;Konstantinos Skianis;Michalis Vazirgiannis", "authorids": "giannisnik@hotmail.com;pmeladianos@aueb.gr;antoine.tixier-1@colorado.edu;kskianis@lix.polytechnique.fr;mvazirg@lix.polytechnique.fr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnikolentzos2018kernel,\ntitle={Kernel Graph Convolutional Neural Nets},\nauthor={Giannis Nikolentzos and Polykarpos Meladianos and Antoine J-P Tixier and Konstantinos Skianis and Michalis Vazirgiannis},\nyear={2018},\nurl={https://openreview.net/forum?id=SyW4Gjg0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyW4Gjg0W", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7qXg1owpaRwJ:scholar.google.com/&scioq=Kernel+Graph+Convolutional+Neural+Nets&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SyWvxWYQf", "title": "Withdrawn", "track": "main", "status": "Withdraw", "tldr": ".", "abstract": "withdrawn", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "withdrawn.", "authorids": "withdrawn", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyWvxWYQf", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "META LEARNING SHARED HIERARCHIES", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/149", "id": "SyX0IeWAW", "author_site": "Kevin Frans, Jonathan Ho, , Pieter Abbeel, John Schulman", "tldr": "learn hierarchal sub-policies through end-to-end training over a distribution of tasks", "abstract": "We develop a metalearning approach for learning hierarchically structured poli- cies, improving sample efficiency on unseen tasks through the use of shared primitives\u2014policies that are executed for large numbers of timesteps. Specifi- cally, a set of primitives are shared within a distribution of tasks, and are switched between by task-specific policies. We provide a concrete metric for measuring the strength of such hierarchies, leading to an optimization problem for quickly reaching high reward on unseen tasks. We then present an algorithm to solve this problem end-to-end through the use of any off-the-shelf reinforcement learning method, by repeatedly sampling new tasks and resetting task-specific policies. We successfully discover meaningful motor primitives for the directional movement of four-legged robots, solely by interacting with distributions of mazes. 
We also demonstrate the transferability of primitives to solve long-timescale sparse-reward obstacle courses, and we enable 3D humanoid robots to robustly walk and crawl with the same policy.", "keywords": "hierarchal reinforcement learning;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Kevin Frans;Jonathan Ho;Xi Chen;Pieter Abbeel;John Schulman", "authorids": "kevinfrans2@gmail.com;jonathanho@berkeley.edu;c.xi@eecs.berkeley.edu;pabbeel@cs.berkeley.edu;joschu@openai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nfrans2018meta,\ntitle={{META} {LEARNING} {SHARED} {HIERARCHIES}},\nauthor={Kevin Frans and Jonathan Ho and Xi Chen and Pieter Abbeel and John Schulman},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyX0IeWAW},\n}", "github": "[![github](/images/github_icon.svg) openai/mlsh](https://github.com/openai/mlsh) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SyX0IeWAW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": -0.9449111825230683, "gs_citation": 456, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8366113293045727240&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyX0IeWAW", "pdf": "https://openreview.net/pdf?id=SyX0IeWAW", "email": ";;;;", "author_num": 5 }, { "id": "SyXNErg0W", "title": "Softmax Supervision with Isotropic Normalization", "track": "main", "status": "Reject", "tldr": "The discriminative capability of softmax for learning feature vectors of objects is effectively enhanced by virture of isotropic normalization on global distribution of data points.", "abstract": "The softmax function is widely used to train deep neural networks for multi-class classification. Despite its outstanding performance in classification tasks, the features derived from the supervision of softmax are usually sub-optimal in some scenarios where Euclidean distances apply in feature spaces. To address this issue, we propose a new loss, dubbed the isotropic loss, in the sense that the overall distribution of data points is regularized to approach the isotropic normal one. Combined with the vanilla softmax, we formalize a novel criterion called the isotropic softmax, or isomax for short, for supervised learning of deep neural networks. By virtue of the isomax, the intra-class features are penalized by the isotropic loss while inter-class distances are well kept by the original softmax loss. Moreover, the isomax loss does not require any additional modifications to the network, mini-batches or the training process. 
Extensive experiments on classification and clustering are performed to demonstrate the superiority and robustness of the isomax loss.", "keywords": "softmax;center loss;triplet loss;convolution neural network;supervised learning", "primary_area": "", "supplementary_material": "", "author": "Yue Zhao;Deli Zhao;Shaohua Wan;Bo Zhang", "authorids": "tigerzhaoyue@outlook.com;zhaodeli@gmail.com;zhangbo@xiaomi.com;wanshaohua@xiaomi.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2018softmax,\ntitle={Softmax Supervision with Isotropic Normalization},\nauthor={Yue Zhao and Deli Zhao and Shaohua Wan and Bo Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=SyXNErg0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyXNErg0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4717249774202563776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SyYYPdg0-", "title": "Counterfactual Image Networks", "track": "main", "status": "Reject", "tldr": "Weakly-supervised image segmentation using compositional structure of images and generative models.", "abstract": "We capitalize on the natural compositional structure of images in order to learn object segmentation with weakly labeled images. The intuition behind our approach is that removing objects from images will yield natural images, however removing random patches will yield unnatural images. We leverage this signal to develop a generative model that decomposes an image into layers, and when all layers are combined, it reconstructs the input image. However, when a layer is removed, the model learns to produce a different image that still looks natural to an adversary, which is possible by removing objects. 
Experiments and visualizations suggest that this model automatically learns object segmentation on images labeled only by scene better than baselines.", "keywords": "computer vision;image segmentation;generative models;adversarial networks;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Deniz Oktay;Carl Vondrick;Antonio Torralba", "authorids": "denizokt@mit.edu;vondrick@google.com;torralba@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\noktay2018counterfactual,\ntitle={Counterfactual Image Networks},\nauthor={Deniz Oktay and Carl Vondrick and Antonio Torralba},\nyear={2018},\nurl={https://openreview.net/forum?id=SyYYPdg0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyYYPdg0-", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16771943422917386182&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Deep Bayesian Bandits Showdown: An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/178", "id": "SyYe6k-CW", "author_site": "Carlos Riquelme, George Tucker, Jasper Snoek", "tldr": "An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling", "abstract": "Recent advances in deep reinforcement learning have made significant strides in performance on applications such as Go and Atari games. However, developing practical methods to balance exploration and exploitation in complex domains remains largely unsolved. Thompson Sampling and its extension to reinforcement learning provide an elegant approach to exploration that only requires access to posterior samples of the model. At the same time, advances in approximate Bayesian methods have made posterior approximation for flexible neural network models practical. Thus, it is attractive to consider approximate Bayesian neural networks in a Thompson Sampling framework. To understand the impact of using an approximate posterior on Thompson Sampling, we benchmark well-established and recently developed methods for approximate posterior sampling combined with Thompson Sampling over a series of contextual bandit problems. We found that many approaches that have been successful in the supervised learning setting underperformed in the sequential decision-making scenario. 
In particular, we highlight the challenge of adapting slowly converging uncertainty estimates to the online setting.", "keywords": "exploration;Thompson Sampling;Bayesian neural networks;bandits;reinforcement learning;variational inference;Monte Carlo", "primary_area": "", "supplementary_material": "", "author": "Carlos Riquelme;George Tucker;Jasper Snoek", "authorids": "rikel@google.com;gjt@google.com;jsnoek@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nriquelme2018deep,\ntitle={Deep Bayesian Bandits Showdown: An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling},\nauthor={Carlos Riquelme and George Tucker and Jasper Snoek},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyYe6k-CW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=SyYe6k-CW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 435, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15082403977285558392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SyYe6k-CW", "pdf": "https://openreview.net/pdf?id=SyYe6k-CW", "email": ";;", "author_num": 3 }, { "title": "Decision-Based Adversarial Attacks: Reliable Attacks Against Black-Box Machine Learning Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/19", "id": "SyZI0GWCZ", "author_site": "Wieland Brendel, Jonas Rauber, Matthias Bethge", "tldr": "A novel adversarial attack that can directly attack real-world black-box machine learning models without transfer.", "abstract": "Many machine learning algorithms are vulnerable to almost imperceptible perturbations of their inputs. So far it was unclear how much risk adversarial perturbations carry for the safety of real-world machine learning applications because most methods used to generate such perturbations rely either on detailed model information (gradient-based attacks) or on confidence scores such as class probabilities (score-based attacks), neither of which are available in most real-world scenarios. In many such cases one currently needs to retreat to transfer-based attacks which rely on cumbersome substitute models, need access to the training data and can be defended against. Here we emphasise the importance of attacks which solely rely on the final model decision. Such decision-based attacks are (1) applicable to real-world black-box models such as autonomous cars, (2) need less knowledge and are easier to apply than transfer-based attacks and (3) are more robust to simple defences than gradient- or score-based attacks. Previous attacks in this category were limited to simple models or simple datasets. Here we introduce the Boundary Attack, a decision-based attack that starts from a large adversarial perturbation and then seeks to reduce the perturbation while staying adversarial. 
The attack is conceptually simple, requires close to no hyperparameter tuning, does not rely on substitute models and is competitive with the best gradient-based attacks in standard computer vision tasks like ImageNet. We apply the attack on two black-box algorithms from Clarifai.com. The Boundary Attack in particular and the class of decision-based attacks in general open new avenues to study the robustness of machine learning models and raise new questions regarding the safety of deployed machine learning systems. An implementation of the attack is available as part of Foolbox (https://github.com/bethgelab/foolbox).", "keywords": "adversarial attacks;adversarial examples;adversarials;robustness;security", "primary_area": "", "supplementary_material": "", "author": "Wieland Brendel *;Jonas Rauber *;Matthias Bethge", "authorids": "wieland.brendel@bethgelab.org;jonas.rauber@bethgelab.org;matthias.bethge@bethgelab.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbrendel2018decisionbased,\ntitle={Decision-Based Adversarial Attacks: Reliable Attacks Against Black-Box Machine Learning Models},\nauthor={Wieland Brendel * and Jonas Rauber * and Matthias Bethge},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyZI0GWCZ},\n}", "github": "[![github](/images/github_icon.svg) bethgelab/foolbox](https://github.com/bethgelab/foolbox) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=SyZI0GWCZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1732, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1222517566911879461&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=SyZI0GWCZ", "pdf": "https://openreview.net/pdf?id=SyZI0GWCZ", "email": ";;", "author_num": 3 }, { "title": "Distributed Distributional Deterministic Policy Gradients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/25", "id": "SyZipzbCb", "author_site": "Gabriel Barth-maron, Matthew Hoffman, David Budden, Will Dabney, Daniel Horgan, Dhruva Tirumala, Alistair Muldal, Nicolas Heess, Timothy Lillicrap", "tldr": "We develop an agent that we call the Distributional Deterministic Deep Policy Gradient algorithm, which achieves state of the art performance on a number of challenging continuous control problems.", "abstract": "This work adopts the very successful distributional perspective on reinforcement learning and adapts it to the continuous control setting. We combine this within a distributed framework for off-policy learning in order to develop what we call the Distributed Distributional Deep Deterministic Policy Gradient algorithm, D4PG. We also combine this technique with a number of additional, simple improvements such as the use of N-step returns and prioritized experience replay. Experimentally we examine the contribution of each of these individual components, and show how they interact, as well as their combined contributions. 
Our results show that across a wide variety of simple control tasks, difficult manipulation tasks, and a set of hard obstacle-based locomotion tasks the D4PG algorithm achieves state of the art performance.", "keywords": "policy gradient;continuous control;actor critic;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Gabriel Barth-Maron;Matthew W. Hoffman;David Budden;Will Dabney;Dan Horgan;Dhruva TB;Alistair Muldal;Nicolas Heess;Timothy Lillicrap", "authorids": "gabrielbm@google.com;mwhoffman@google.com;budden@google.com;wdabney@google.com;horgan@google.com;dhruvat@google.com;alimuldal@google.com;heess@google.com;countzero@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nbarth-maron2018distributional,\ntitle={Distributional Policy Gradients},\nauthor={Gabriel Barth-Maron and Matthew W. Hoffman and David Budden and Will Dabney and Dan Horgan and Dhruva TB and Alistair Muldal and Nicolas Heess and Timothy Lillicrap},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyZipzbCb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=SyZipzbCb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;9", "confidence": "4;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 9, "corr_rating_confidence": -0.2773500981126146, "gs_citation": 726, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13940862836810359018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyZipzbCb", "pdf": "https://openreview.net/pdf?id=SyZipzbCb", "email": ";;;;;;;;", "author_num": 9 }, { "id": "Sy_MK3lAZ", "title": "PARAMETRIZED DEEP Q-NETWORKS LEARNING: PLAYING ONLINE BATTLE ARENA WITH DISCRETE-CONTINUOUS HYBRID ACTION SPACE", "track": "main", "status": "Reject", "tldr": "A DQN and DDPG hybrid algorithm is proposed to deal with the discrete-continuous hybrid action space.", "abstract": "Most existing deep reinforcement learning (DRL) frameworks consider action spaces that are either\ndiscrete or continuous space. Motivated by the project of design Game AI for King of Glory\n(KOG), one the world\u2019s most popular mobile game, we consider the scenario with the discrete-continuous\nhybrid action space. To directly apply existing DLR frameworks, existing approaches\neither approximate the hybrid space by a discrete set or relaxing it into a continuous set, which is\nusually less efficient and robust. In this paper, we propose a parametrized deep Q-network (P-DQN)\nfor the hybrid action space without approximation or relaxation. Our algorithm combines DQN and\nDDPG and can be viewed as an extension of the DQN to hybrid actions. 
The empirical study on the\ngame KOG validates the efficiency and effectiveness of our method.", "keywords": "Deep reinforcement learning;Hybrid action space;DQN;DDPG", "primary_area": "", "supplementary_material": "", "author": "Jiechao Xiong;Qing Wang;Zhuoran Yang;Peng Sun;Yang Zheng;Lei Han;Haobo Fu;Xiangru Lian;Carson Eisenach;Haichuan Yang;Emmanuel Ekwedike;Bei Peng;Haoyue Gao;Tong Zhang;Ji Liu;Han Liu", "authorids": ";;;;;;haobofu@tencent.com;;;;;;;;;", "gender": ";;;;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;;;", "bibtex": "@misc{\nxiong2018parametrized,\ntitle={{PARAMETRIZED} {DEEP} Q-{NETWORKS} {LEARNING}: {PLAYING} {ONLINE} {BATTLE} {ARENA} {WITH} {DISCRETE}-{CONTINUOUS} {HYBRID} {ACTION} {SPACE}},\nauthor={Jiechao Xiong and Qing Wang and Zhuoran Yang and Peng Sun and Yang Zheng and Lei Han and Haobo Fu and Xiangru Lian and Carson Eisenach and Haichuan Yang and Emmanuel Ekwedike and Bei Peng and Haoyue Gao and Tong Zhang and Ji Liu and Han Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=Sy_MK3lAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy_MK3lAZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 16, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4069342908746964806&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "SybqeKgA-", "title": "On Batch Adaptive Training for Deep Learning: Lower Loss and Larger Step Size", "track": "main", "status": "Reject", "tldr": "We developed a batch adaptive momentum that can achieve lower loss compared with mini-batch methods after scanning same epochs of data, and it is more robust against large step size.", "abstract": "Mini-batch gradient descent and its variants are commonly used in deep learning. The principle of mini-batch gradient descent is to use noisy gradient calculated on a batch to estimate the real gradient, thus balancing the computation cost per iteration and the uncertainty of noisy gradient. However, its batch size is a fixed hyper-parameter requiring manual setting before training the neural network. Yin et al. (2017) proposed a batch adaptive stochastic gradient descent (BA-SGD) that can dynamically choose a proper batch size as learning proceeds. We extend the BA-SGD to momentum algorithm and evaluate both the BA-SGD and the batch adaptive momentum (BA-Momentum) on two deep learning tasks from natural language processing to image classification. Experiments confirm that batch adaptive methods can achieve a lower loss compared with mini-batch methods after scanning the same epochs of data. Furthermore, our BA-Momentum is more robust against larger step sizes, in that it can dynamically enlarge the batch size to reduce the larger uncertainty brought by larger step sizes. We also identified an interesting phenomenon, batch size boom. 
The code implementing batch adaptive framework is now open source, applicable to any gradient-based optimization problems.", "keywords": "deep learning;optimization", "primary_area": "", "supplementary_material": "", "author": "Runyao Chen;Kun Wu;Ping Luo", "authorids": "chenrunyao14@mails.ucas.ac.cn;WuKun14@mails.ucas.ac.cn;luop@ict.ac.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2018on,\ntitle={On Batch Adaptive Training for Deep Learning: Lower Loss and Larger Step Size},\nauthor={Runyao Chen and Kun Wu and Ping Luo},\nyear={2018},\nurl={https://openreview.net/forum?id=SybqeKgA-},\n}", "github": "[![github](/images/github_icon.svg) thomasyao3096/Batch_Adaptive_Framework](https://github.com/thomasyao3096/Batch_Adaptive_Framework)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SybqeKgA-", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dOZONhJj0QEJ:scholar.google.com/&scioq=On+Batch+Adaptive+Training+for+Deep+Learning:+Lower+Loss+and+Larger+Step+Size&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SyfiiMZA-", "title": "Jointly Learning to Construct and Control Agents using Deep Reinforcement Learning", "track": "main", "status": "Workshop", "tldr": "Use deep reinforcement learning to design the physical attributes of a robot jointly with a control policy.", "abstract": "The physical design of a robot and the policy that controls its motion are inherently coupled. However, existing approaches largely ignore this coupling, instead choosing to alternate between separate design and control phases, which requires expert intuition throughout and risks convergence to suboptimal designs. In this work, we propose a method that jointly optimizes over the physical design of a robot and the corresponding control policy in a model-free fashion, without any need for expert supervision. Given an arbitrary robot morphology, our method maintains a distribution over the design parameters and uses reinforcement learning to train a neural network controller. Throughout training, we refine the robot distribution to maximize the expected reward. This results in an assignment to the robot parameters and neural network policy that are jointly optimal. We evaluate our approach in the context of legged locomotion, and demonstrate that it discovers novel robot designs and walking gaits for several different morphologies, achieving performance comparable to or better than that of hand-crafted designs.", "keywords": "robot locomotion;reinforcement learning;policy gradients;physical design;deep learning", "primary_area": "", "supplementary_material": "", "author": "Charles Schaff;David Yunis;Ayan Chakrabarti;Matthew R. 
Walter", "authorids": "cbschaff@ttic.edu;dyunis@uchicago.edu;ayan@wustl.edu;mwalter@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschaff2018jointly,\ntitle={Jointly Learning to Construct and Control Agents using Deep Reinforcement Learning},\nauthor={Charles Schaff and David Yunis and Ayan Chakrabarti and Matthew R. Walter},\nyear={2018},\nurl={https://openreview.net/forum?id=SyfiiMZA-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SyfiiMZA-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=SyfiiMZA-", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;3;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16524130189701551253&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "title": "Go for a Walk and Arrive at the Answer: Reasoning Over Paths in Knowledge Bases using Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/50", "id": "Syg-YfWCW", "author_site": "Rajarshi Das, Shehzaad Dhuliawala, Manzil Zaheer, Luke Vilnis, Ishan Durugkar, Akshay Krishnamurthy, Alex Smola, Andrew McCallum", "tldr": "We present a RL agent MINERVA which learns to walk on a knowledge graph and answer queries", "abstract": "Knowledge bases (KB), both automatically and manually constructed, are often incomplete --- many valid facts can be inferred from the KB by synthesizing existing information. A popular approach to KB completion is to infer new relations by combinatory reasoning over the information found along other paths connecting a pair of entities. Given the enormous size of KBs and the exponential number of paths, previous path-based models have considered only the problem of predicting a missing relation given two entities, or evaluating the truth of a proposed triple. Additionally, these methods have traditionally used random paths between fixed entity pairs or more recently learned to pick paths between them. We propose a new algorithm, MINERVA, which addresses the much more difficult and practical task of answering questions where the relation is known, but only one entity. Since random walks are impractical in a setting with unknown destination and combinatorially many paths from a start node, we present a neural reinforcement learning approach which learns how to navigate the graph conditioned on the input query to find predictive paths. On a comprehensive evaluation on seven knowledge base datasets, we found MINERVA to be competitive with many current state-of-the-art methods. 
", "keywords": "Knowledge Graphs;Reinforcement Learning;Query Answering", "primary_area": "", "supplementary_material": "", "author": "Rajarshi Das;Shehzaad Dhuliawala;Manzil Zaheer;Luke Vilnis;Ishan Durugkar;Akshay Krishnamurthy;Alex Smola;Andrew McCallum", "authorids": "rajarshi@cs.umass.edu;sdhuliawala@cs.umass.edu;manzil@cmu.edu;luke@cs.umass.edu;ishand@cs.utexas.edu;akshay@cs.umass.edu;alex@smola.org;mccallum@cs.umass.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\ndas2018go,\ntitle={Go for a Walk and Arrive at the Answer: Reasoning Over Paths in Knowledge Bases using Reinforcement Learning},\nauthor={Rajarshi Das and Shehzaad Dhuliawala and Manzil Zaheer and Luke Vilnis and Ishan Durugkar and Akshay Krishnamurthy and Alex Smola and Andrew McCallum},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Syg-YfWCW},\n}", "github": "[![github](/images/github_icon.svg) shehzaadzd/MINERVA](https://github.com/shehzaadzd/MINERVA) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=Syg-YfWCW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 28, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 701, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4820794446342808007&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Syg-YfWCW", "pdf": "https://openreview.net/pdf?id=Syg-YfWCW", "email": ";;;;;;;", "author_num": 8 }, { "title": "Semi-parametric topological memory for navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/61", "id": "SygwwGbRW", "author_site": "Nikolay Savinov, Alexey Dosovitskiy, Vladlen Koltun", "tldr": "We introduce a new memory architecture for navigation in previously unseen environments, inspired by landmark-based navigation in animals.", "abstract": "We introduce a new memory architecture for navigation in previously unseen environments, inspired by landmark-based navigation in animals. The proposed semi-parametric topological memory (SPTM) consists of a (non-parametric) graph with nodes corresponding to locations in the environment and a (parametric) deep network capable of retrieving nodes from the graph based on observations. The graph stores no metric information, only connectivity of locations corresponding to the nodes. We use SPTM as a planning module in a navigation system. Given only 5 minutes of footage of a previously unseen maze, an SPTM-based navigation agent can build a topological map of the environment and use it to confidently navigate towards goals. 
The average success rate of the SPTM agent in goal-directed navigation across test environments is higher than the best-performing baseline by a factor of three.", "keywords": "deep learning;navigation;memory", "primary_area": "", "supplementary_material": "", "author": "Nikolay Savinov;Alexey Dosovitskiy;Vladlen Koltun", "authorids": "nikolay.savinov@inf.ethz.ch;adosovitskiy@gmail.com;vkoltun@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsavinov2018semiparametric,\ntitle={Semi-parametric topological memory for navigation},\nauthor={Nikolay Savinov and Alexey Dosovitskiy and Vladlen Koltun},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SygwwGbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 450, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10804671962093103166&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SygwwGbRW", "pdf": "https://openreview.net/pdf?id=SygwwGbRW", "email": ";;", "author_num": 3 }, { "id": "SyhRVm-Rb", "title": "Automatic Goal Generation for Reinforcement Learning Agents", "track": "main", "status": "Reject", "tldr": "We efficiently solve multi-task problems with an automatic curriculum generation algorithm based on a generative model that tracks the learning agent's performance.", "abstract": "Reinforcement learning (RL) is a powerful technique to train an agent to perform a task. However, an agent that is trained using RL is only capable of achieving the single task that is specified via its reward function. Such an approach does not scale well to settings in which an agent needs to perform a diverse set of tasks, such as navigating to varying positions in a room or moving objects to varying locations. Instead, we propose a method that allows an agent to automatically discover the range of tasks that it is capable of performing in its environment. We use a generator network to propose tasks for the agent to try to achieve, each task being specified as reaching a certain parametrized subset of the state-space. The generator network is optimized using adversarial training to produce tasks that are always at the appropriate level of difficulty for the agent. Our method thus automatically produces a curriculum of tasks for the agent to learn. We show that, by using this framework, an agent can efficiently and automatically learn to perform a wide set of tasks without requiring any prior knowledge of its environment (Videos and code available at: https://sites.google.com/view/goalgeneration4rl). 
Our method can also learn to achieve tasks with sparse rewards, which pose significant challenges for traditional RL methods.", "keywords": "Reinforcement Learning;Multi-task Learning;Curriculum Learning", "primary_area": "", "supplementary_material": "", "author": "David Held;Xinyang Geng;Carlos Florensa;Pieter Abbeel", "authorids": "dheld@andrew.cmu.edu;young.geng@berkeley.edu;florensa@berkeley.edu;pabbeel@berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nheld2018automatic,\ntitle={Automatic Goal Generation for Reinforcement Learning Agents},\nauthor={David Held and Xinyang Geng and Carlos Florensa and Pieter Abbeel},\nyear={2018},\nurl={https://openreview.net/forum?id=SyhRVm-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyhRVm-Rb", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 23, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 527, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5836114268256047177&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SyhcXjy0Z", "title": "APPLICATION OF DEEP CONVOLUTIONAL NEURAL NETWORK TO PREVENT ATM FRAUD BY FACIAL DISGUISE IDENTIFICATION", "track": "main", "status": "Reject", "tldr": "Proposed System can prevent impersonators with facial disguises from completing a fraudulent transaction using a pre-trained DCNN.", "abstract": "The paper proposes and demonstrates a Deep Convolutional Neural Network (DCNN) architecture to identify users with disguised face attempting a fraudulent ATM transaction. The recent introduction of Disguised Face Identification (DFI) framework proves the applicability of deep neural networks for this very problem. All the ATMs nowadays incorporate a hidden camera in them and capture the footage of their users. However, it is impossible for the police to track down the impersonators with disguised faces from the ATM footage. The proposed deep convolutional neural network is trained to identify, in real time, whether the user in the captured image is trying to cloak his identity or not. The output of the DCNN is then reported to the ATM to take appropriate steps and prevent the swindler from completing the transaction. The network is trained using a dataset of images captured in similar situations as of an ATM. 
The comparatively low background clutter in the images enables the network to demonstrate high accuracy in feature extraction and classification for all the different disguises.", "keywords": "Deep Convolutional Neural Network;Disguised Face Identification;Fraudulent Transaction;ATM;Impersonation;", "primary_area": "", "supplementary_material": "", "author": "Suraj Nandkishor Kothawade;Sumit Baburao Tamgale", "authorids": "kothawadesuraj@sggs.ac.in;tamgalesumit@sggs.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnandkishor2018application,\ntitle={{APPLICATION} {OF} {DEEP} {CONVOLUTIONAL} {NEURAL} {NETWORK} {TO} {PREVENT} {ATM} {FRAUD} {BY} {FACIAL} {DISGUISE} {IDENTIFICATION}},\nauthor={Suraj Nandkishor Kothawade and Sumit Baburao Tamgale},\nyear={2018},\nurl={https://openreview.net/forum?id=SyhcXjy0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyhcXjy0Z", "pdf_size": 0, "rating": "1;2;3", "confidence": "5;4;5", "rating_avg": 2.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4191132110041537427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "PixelNN: Example-based Image Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/209", "id": "Syhr6pxCW", "author_site": "Aayush Bansal, Yaser Sheikh, Deva Ramanan", "tldr": "Pixel-wise nearest neighbors used for generating multiple images from incomplete priors such as a low-res images, surface normals, edges etc.", "abstract": "We present a simple nearest-neighbor (NN) approach that synthesizes high-frequency photorealistic images from an ``incomplete'' signal such as a low-resolution image, a surface normal map, or edges. Current state-of-the-art deep generative models designed for such conditional image synthesis lack two important things: (1) they are unable to generate a large set of diverse outputs, due to the mode collapse problem. (2) they are not interpretable, making it difficult to control the synthesized output. We demonstrate that NN approaches potentially address such limitations, but suffer in accuracy on small datasets. We design a simple pipeline that combines the best of both worlds: the first stage uses a convolutional neural network (CNN) to map the input to a (overly-smoothed) image, and the second stage uses a pixel-wise nearest neighbor method to map the smoothed output to multiple high-quality, high-frequency outputs in a controllable manner. Importantly, pixel-wise matching allows our method to compose novel high-frequency content by cutting-and-pasting pixels from different training exemplars. 
We demonstrate our approach for various input modalities, and for various domains ranging from human faces, pets, shoes, and handbags.", "keywords": "conditional image synthesis;nearest neighbors", "primary_area": "", "supplementary_material": "", "author": "Aayush Bansal;Yaser Sheikh;Deva Ramanan", "authorids": "aayushb@cs.cmu.edu;yaser@cs.cmu.edu;deva@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbansal2018pixelnn,\ntitle={Pixel{NN}: Example-based Image Synthesis},\nauthor={Aayush Bansal and Yaser Sheikh and Deva Ramanan},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=Syhr6pxCW},\n}", "github": "[![github](/images/github_icon.svg) aayushbansal/PixelNN-Code](https://github.com/aayushbansal/PixelNN-Code)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 255, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16832087782645647806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Syhr6pxCW", "pdf": "https://openreview.net/pdf?id=Syhr6pxCW", "email": ";;", "author_num": 3 }, { "id": "Syjha0gAZ", "title": "Loss Functions for Multiset Prediction", "track": "main", "status": "Reject", "tldr": "We study the problem of multiset prediction and propose a novel multiset loss function, providing analysis and empirical evidence that demonstrates its effectiveness.", "abstract": "We study the problem of multiset prediction. The goal of multiset prediction is to train a predictor that maps an input to a multiset consisting of multiple items. Unlike existing problems in supervised learning, such as classification, ranking and sequence generation, there is no known order among items in a target multiset, and each item in the multiset may appear more than once, making this problem extremely challenging. In this paper, we propose a novel multiset loss function by viewing this problem from the perspective of sequential decision making. The proposed multiset loss function is empirically evaluated on two families of datasets, one synthetic and the other real, with varying levels of difficulty, against various baseline loss functions including reinforcement learning, sequence, and aggregated distribution matching loss functions. 
The experiments reveal the effectiveness of the proposed loss function over the others.", "keywords": "machine learning;deep learning;structured prediction;sequential prediction", "primary_area": "", "supplementary_material": "", "author": "Sean Welleck;Zixin Yao;Yu Gai;Jialin Mao;Zheng Zhang;Kyunghyun Cho", "authorids": "wellecks@nyu.edu;zy566@nyu.edu;yg1246@nyu.edu;jm5830@nyu.edu;zz@nyu.edu;kyunghyun.cho@nyu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwelleck2018loss,\ntitle={Loss Functions for Multiset Prediction},\nauthor={Sean Welleck and Zixin Yao and Yu Gai and Jialin Mao and Zheng Zhang and Kyunghyun Cho},\nyear={2018},\nurl={https://openreview.net/forum?id=Syjha0gAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Syjha0gAZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=179679515432814537&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "SyjjD1WRb", "title": "Evolutionary Expectation Maximization for Generative Models with Binary Latents", "track": "main", "status": "Reject", "tldr": "We present Evolutionary EM as a novel algorithm for unsupervised training of generative models with binary latent variables that intimately connects variational EM with evolutionary optimization", "abstract": "We establish a theoretical link between evolutionary algorithms and variational parameter optimization of probabilistic generative models with binary hidden variables.\nWhile the novel approach is independent of the actual generative model, here we use two such models to investigate its applicability and scalability: a noisy-OR Bayes Net (as a standard example of binary data) and Binary Sparse Coding (as a model for continuous data).\n\nLearning of probabilistic generative models is first formulated as approximate maximum likelihood optimization using variational expectation maximization (EM).\nWe choose truncated posteriors as variational distributions in which discrete latent states serve as variational parameters. In the variational E-step,\nthe latent states are then \noptimized according to a tractable free-energy objective. Given a data point, we can show that evolutionary algorithms can be used for the variational optimization loop by (A)~considering the bit-vectors of the latent states as genomes of individuals, and by (B)~defining the fitness of the\nindividuals as the (log) joint probabilities given by the used generative model.\n\nAs a proof of concept, we apply the novel evolutionary EM approach to the optimization of the parameters of noisy-OR Bayes nets and binary sparse coding on artificial and real data (natural image patches). Using point mutations and single-point cross-over for the evolutionary algorithm, we find that scalable variational EM algorithms are obtained which efficiently improve the data likelihood. 
In general we believe that, with the link established here, standard as well as recent results in the field of evolutionary optimization can be leveraged to address the difficult problem of parameter optimization in generative models.", "keywords": "unsupervised;learning;evolutionary;sparse;coding;noisyOR;BSC;EM;expectation-maximization;variational EM;optimization", "primary_area": "", "supplementary_material": "", "author": "Enrico Guiraud;Jakob Drefs;Joerg Luecke", "authorids": "enrico.guiraud@cern.ch;jakob.heinrich.drefs@uni-oldenburg.de;joerg.luecke@uni-oldenburg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nguiraud2018evolutionary,\ntitle={Evolutionary Expectation Maximization for Generative Models with Binary Latents},\nauthor={Enrico Guiraud and Jakob Drefs and Joerg Luecke},\nyear={2018},\nurl={https://openreview.net/forum?id=SyjjD1WRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyjjD1WRb", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fPKtDhr7pEMJ:scholar.google.com/&scioq=Evolutionary+Expectation+Maximization+for+Generative+Models+with+Binary+Latents&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SyjsLqxR-", "title": "Universality, Robustness, and Detectability of Adversarial Perturbations under Adversarial Training", "track": "main", "status": "Reject", "tldr": "We empirically show that adversarial training is effective for removing universal perturbations, makes adversarial examples less robust to image transformations, and leaves them detectable for a detection approach.", "abstract": "Classifiers such as deep neural networks have been shown to be vulnerable against adversarial perturbations on problems with high-dimensional input space. While adversarial training improves the robustness of classifiers against such adversarial perturbations, it leaves classifiers sensitive to them on a non-negligible fraction of the inputs. We argue that there are two different kinds of adversarial perturbations: shared perturbations which fool a classifier on many inputs and singular perturbations which only fool the classifier on a small fraction of the data. We find that adversarial training increases the robustness of classifiers against shared perturbations. Moreover, it is particularly effective in removing universal perturbations, which can be seen as an extreme form of shared perturbations. Unfortunately, adversarial training does not consistently increase the robustness against singular perturbations on unseen inputs. However, we find that adversarial training decreases robustness of the remaining perturbations against image transformations such as changes to contrast and brightness or Gaussian blurring. It thus makes successful attacks on the classifier in the physical world less likely. Finally, we show that even singular perturbations can be easily detected and must thus exhibit generalizable patterns even though the perturbations are specific for certain inputs. 
", "keywords": "adversarial examples;adversarial training;universal perturbations;safety;deep learning", "primary_area": "", "supplementary_material": "", "author": "Jan Hendrik Metzen", "authorids": "janhendrik.metzen@de.bosch.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nhendrik2018universality,\ntitle={Universality, Robustness, and Detectability of Adversarial Perturbations under Adversarial Training},\nauthor={Jan Hendrik Metzen},\nyear={2018},\nurl={https://openreview.net/forum?id=SyjsLqxR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyjsLqxR-", "pdf_size": 0, "rating": "3;6;6", "confidence": "4;3;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3815965138552151092&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Syl3_2JCZ", "title": "A Self-Organizing Memory Network", "track": "main", "status": "Reject", "tldr": "We derived biologically plausible synaptic plasticity learning rules for a recurrent neural network to store stimulus representations. ", "abstract": "Working memory requires information about external stimuli to be represented in the brain even after those stimuli go away. This information is encoded in the activities of neurons, and neural activities change over timescales of tens of milliseconds. Information in working memory, however, is retained for tens of seconds, suggesting the question of how time-varying neural activities maintain stable representations. Prior work shows that, if the neural dynamics are in the ` null space' of the representation - so that changes to neural activity do not affect the downstream read-out of stimulus information - then information can be retained for periods much longer than the time-scale of individual-neuronal activities. The prior work, however, requires precisely constructed synaptic connectivity matrices, without explaining how this would arise in a biological neural network. To identify mechanisms through which biological networks can self-organize to learn memory function, we derived biologically plausible synaptic plasticity rules that dynamically modify the connectivity matrix to enable information storing. Networks implementing this plasticity rule can successfully learn to form memory representations even if only 10% of the synapses are plastic, they are robust to synaptic noise, and they can represent information about multiple stimuli. 
", "keywords": "Working Memory;Learning Rules;Stimulus Representations", "primary_area": "", "supplementary_material": "", "author": "Callie Federer;Joel Zylberberg", "authorids": "callie.federer@ucdenver.edu;joel.zylberberg@ucdenver.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfederer2018a,\ntitle={A Self-Organizing Memory Network},\nauthor={Callie Federer and Joel Zylberberg},\nyear={2018},\nurl={https://openreview.net/forum?id=Syl3_2JCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=Syl3_2JCZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SylJ1D1C-", "title": "PDE-Net: Learning PDEs from Data", "track": "main", "status": "Workshop", "tldr": "This paper proposes a new feed-forward network, call PDE-Net, to learn PDEs from data. ", "abstract": "Partial differential equations (PDEs) play a prominent role in many disciplines such as applied mathematics, physics, chemistry, material science, computer science, etc. PDEs are commonly derived based on physical laws or empirical observations. However, the governing equations for many complex systems in modern applications are still not fully known. With the rapid development of sensors, computational power, and data storage in the past decade, huge quantities of data can be easily collected and efficiently stored. Such vast quantity of data offers new opportunities for data-driven discovery of hidden physical laws. Inspired by the latest development of neural network designs in deep learning, we propose a new feed-forward deep network, called PDE-Net, to fulfill two objectives at the same time: to accurately predict dynamics of complex systems and to uncover the underlying hidden PDE models. The basic idea of the proposed PDE-Net is to learn differential operators by learning convolution kernels (filters), and apply neural networks or other machine learning methods to approximate the unknown nonlinear responses. Comparing with existing approaches, which either assume the form of the nonlinear response is known or fix certain finite difference approximations of differential operators, our approach has the most flexibility by learning both differential operators and the nonlinear responses. A special feature of the proposed PDE-Net is that all filters are properly constrained, which enables us to easily identify the governing PDE models while still maintaining the expressive and predictive power of the network. These constrains are carefully designed by fully exploiting the relation between the orders of differential operators and the orders of sum rules of filters (an important concept originated from wavelet theory). We also discuss relations of the PDE-Net with some existing networks in computer vision such as Network-In-Network (NIN) and Residual Neural Network (ResNet). 
Numerical experiments show that the PDE-Net has the potential to uncover the hidden PDE of the observed dynamics, and predict the dynamical behavior for a relatively long time, even in a noisy environment.", "keywords": "deep convolution network;partial differential equation;physical laws", "primary_area": "", "supplementary_material": "", "author": "Zichao Long;Yiping Lu;Xianzhong Ma;Bin Dong", "authorids": "zlong@pku.edu.cn;luyiping9712@pku.edu.cn;xianzhongma@pku.edu.cn;dongbin@math.pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlong2018pdenet,\ntitle={{PDE}-Net: Learning {PDE}s from Data},\nauthor={Zichao Long and Yiping Lu and Xianzhong Ma and Bin Dong},\nyear={2018},\nurl={https://openreview.net/forum?id=SylJ1D1C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SylJ1D1C-", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 980, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12110088803814108713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "Symlf87uG", "title": "Semi-supervised Regression with Generative Adversarial Networks for End to End Learning in Autonomous Driving", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This research concerns solving the semi-supervised learning problem with generative adversarial networks for regression. In contrast to classification, where only a limited number of distinct classes is given, the regression task is defined as predicting continuous labels for a given dataset. Semi-supervised learning is of vital importance for the applications where a small number of labeled samples is available, or labeling samples is difficult or expensive to collect. A case in point is autonomous driving in which obtaining sufficient labeled samples covering all driving conditions is costly. In this context, we can take advantage of semi-supervised learning techniques with groundbreaking generative models, such as generative adversarial networks. However, almost all proposed GAN-based semi-supervised techniques in the literature are focused on solving the classification problem. Hence, developing a GAN-based semi-supervised method for the regression task is still an open problem. In this work, two different architectures will be proposed to address this problem. In summary, our introduced method is able to predict continuous labels for a training dataset which has only a limited number of labeled samples. Moreover, the application of this technique for solving the end-to-end task in autonomous driving will be presented. \nWe performed several experiments over a publicly available driving dataset to evaluate our proposed method, and the results are very promising. 
The results show that our approach generates images with high quality, gives smaller label prediction error and leads to a more stable training compared with the state-of-the-art Improved GAN technique~\\citep{ImprovedGAN2016}.", "keywords": "GAN;Regression;SSL;Autonomous Driving", "primary_area": "", "supplementary_material": "", "author": "Mehdi Rezagholizadeh;Md Akmal Haidar", "authorids": "mehdi.rezagholizadeh@gmail.com;md.akmal.haidar@huawei.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nrezagholizadeh2018semisupervised,\ntitle={Semi-supervised Regression with Generative Adversarial Networks for End to End Learning in Autonomous Driving},\nauthor={Mehdi Rezagholizadeh and Md Akmal Haidar},\nyear={2018},\nurl={https://openreview.net/forum?id=SJ2ZjsyC-},\n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=Symlf87uG", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7547862446525939110&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Reinforcement Learning Algorithm Selection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/295", "id": "SyoDInJ0-", "author_site": "Romain Laroche, Rapha\u00ebl F\u00e9raud", "tldr": "This paper formalises the problem of online algorithm selection in the context of Reinforcement Learning.", "abstract": "This paper formalises the problem of online algorithm selection in the context of Reinforcement Learning (RL). The setup is as follows: given an episodic task and a finite number of off-policy RL algorithms, a meta-algorithm has to decide which RL algorithm is in control during the next episode so as to maximize the expected return. The article presents a novel meta-algorithm, called Epochal Stochastic Bandit Algorithm Selection (ESBAS). Its principle is to freeze the policy updates at each epoch, and to leave a rebooted stochastic bandit in charge of the algorithm selection. Under some assumptions, a thorough theoretical analysis demonstrates its near-optimality considering the structural sampling budget limitations. ESBAS is first empirically evaluated on a dialogue task where it is shown to outperform each individual algorithm in most configurations. ESBAS is then adapted to a true online setting where algorithms update their policies after each transition, which we call SSBAS. 
SSBAS is evaluated on a fruit collection task where it is shown to adapt the stepsize parameter more efficiently than the classical hyperbolic decay, and on an Atari game, where it improves the performance by a wide margin.", "keywords": "Reinforcement Learning;Multi-Armed Bandit;Algorithm Selection", "primary_area": "", "supplementary_material": "", "author": "Romain Laroche;Raphael Feraud", "authorids": "romain.laroche@gmail.com;raphael.feraud@orange.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlaroche2018reinforcement,\ntitle={Reinforcement Learning Algorithm Selection},\nauthor={Romain Laroche and Raphael Feraud},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyoDInJ0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12019499194605958978&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyoDInJ0-", "pdf": "https://openreview.net/pdf?id=SyoDInJ0-", "email": ";", "author_num": 2 }, { "id": "SyqAPeWAZ", "title": "CNNs as Inverse Problem Solvers and Double Network Superresolution", "track": "main", "status": "Reject", "tldr": "After proving that a neuron acts as an inverse problem solver for superresolution and a network of neurons is guarantied to provide a solution, we proposed a double network architecture that performs faster than state-of-the-art.", "abstract": "In recent years Convolutional Neural Networks (CNN) have been used extensively for Superresolution (SR). In this paper, we use inverse problem and sparse representation solutions to form a mathematical basis for CNN operations. We show how a single neuron is able to provide the optimum solution for inverse problem, given a low resolution image dictionary as an operator. Introducing a new concept called Representation Dictionary Duality, we show that CNN elements (filters) are trained to be representation vectors and then, during reconstruction, used as dictionaries. 
In the light of theoretical work, we propose a new algorithm which uses two networks with different structures that are separately trained with low and high coherency image patches and show that it performs faster compared to the state-of-the-art algorithms while not sacrificing from performance.", "keywords": "superresolution;convolutional neural network;sparse representation;inverse problem", "primary_area": "", "supplementary_material": "", "author": "Cem TARHAN;G\u00f6zde BOZDA\u011eI AKAR", "authorids": "cemtarhan@aselsan.com.tr;bozdagi@metu.edu.tr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntarhan2018cnns,\ntitle={{CNN}s as Inverse Problem Solvers and Double Network Superresolution},\nauthor={Cem TARHAN and G\u00f6zde BOZDA\u011eI AKAR},\nyear={2018},\nurl={https://openreview.net/forum?id=SyqAPeWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyqAPeWAZ", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GG4s4TEQH-QJ:scholar.google.com/&scioq=CNNs+as+Inverse+Problem+Solvers+and+Double+Network+Superresolution&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Syntax-Directed Variational Autoencoder for Structured Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/32", "id": "SyqShMZRb", "author_site": "Hanjun Dai, Yingtao Tian, Bo Dai, Steven Skiena, Le Song", "tldr": "A new generative model for discrete structured data. The proposed stochastic lazy attribute converts the offline semantic check into online guidance for stochastic decoding, which effectively addresses the constraints in syntax and semantics, and also achieves superior performance", "abstract": "Deep generative models have been enjoying success in modeling continuous data. However it remains challenging to capture the representations for discrete structures with formal grammars and semantics, e.g., computer programs and molecular structures. How to generate both syntactically and semantically correct data still remains largely an open problem. Inspired by the theory of compiler where syntax and semantics check is done via syntax-directed translation (SDT), we propose a novel syntax-directed variational autoencoder (SD-VAE) by introducing stochastic lazy attributes. This approach converts the offline SDT check into on-the-fly generated guidance for constraining the decoder. Comparing to the state-of-the-art methods, our approach enforces constraints on the output space so that the output will be not only syntactically valid, but also semantically reasonable. We evaluate the proposed model with applications in programming language and molecules, including reconstruction and program/molecule optimization. 
The results demonstrate the effectiveness in incorporating syntactic and semantic constraints in discrete generative models, which is significantly better than current state-of-the-art approaches.", "keywords": "generative model for structured data;syntax-directed generation;molecule and program optimization;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Hanjun Dai;Yingtao Tian;Bo Dai;Steven Skiena;Le Song", "authorids": "hanjundai@gatech.edu;yittian@cs.stonybrook.edu;bohr.dai@gmail.com;skiena@cs.stonybrook.edu;lsong@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndai2018syntaxdirected,\ntitle={Syntax-Directed Variational Autoencoder for Structured Data},\nauthor={Hanjun Dai and Yingtao Tian and Bo Dai and Steven Skiena and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyqShMZRb},\n}", "github": "[![github](/images/github_icon.svg) Hanjun-Dai/sdvae](https://github.com/Hanjun-Dai/sdvae)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "3;5;7", "confidence": "2;1;3", "rating_avg": 5.0, "confidence_avg": 2.0, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 446, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7991796845235005593&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyqShMZRb", "pdf": "https://openreview.net/pdf?id=SyqShMZRb", "email": ";;;;", "author_num": 5 }, { "id": "Syr8Qc1CW", "title": "DNA-GAN: Learning Disentangled Representations from Multi-Attribute Images", "track": "main", "status": "Workshop", "tldr": "We proposed a supervised algorithm, DNA-GAN, to disentangle multiple attributes of images.", "abstract": "Disentangling factors of variation has always been a challenging problem in representation learning. Existing algorithms suffer from many limitations, such as unpredictable disentangling factors, bad quality of generated images from encodings, lack of identity information, etc. In this paper, we proposed a supervised algorithm called DNA-GAN trying to disentangle different attributes of images. The latent representations of images are DNA-like, in which each individual piece represents an independent factor of variation. By annihilating the recessive piece and swapping a certain piece of two latent representations, we obtain another two different representations which could be decoded into images. In order to obtain realistic images and also disentangled representations, we introduced the discriminator for adversarial training. 
Experiments on Multi-PIE and CelebA datasets demonstrate the effectiveness of our method and the advantage of overcoming limitations existing in other methods.", "keywords": "disentangled representations;multi-attribute images;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Taihong Xiao;Jiapeng Hong;Jinwen Ma", "authorids": "xiaotaihong@pku.edu.cn;jphong@pku.edu.cn;jwma@math.pku.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nxiao2018dnagan,\ntitle={{DNA}-{GAN}: Learning Disentangled Representations from Multi-Attribute Images},\nauthor={Taihong Xiao and Jiapeng Hong and Jinwen Ma},\nyear={2018},\nurl={https://openreview.net/forum?id=Syr8Qc1CW},\n}", "github": "[![github](/images/github_icon.svg) Prinsphield/DNA-GAN](https://github.com/Prinsphield/DNA-GAN)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Syr8Qc1CW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10316950315000655632&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SyrGJYlRZ", "title": "YellowFin and the Art of Momentum Tuning", "track": "main", "status": "Reject", "tldr": "YellowFin is an SGD based optimizer with both momentum and learning rate adaptivity.", "abstract": "Hyperparameter tuning is one of the most time-consuming workloads in deep learning. State-of-the-art optimizers, such as AdaGrad, RMSProp and Adam, reduce this labor by adaptively tuning an individual learning rate for each variable. Recently researchers have shown renewed interest in simpler methods like momentum SGD as they may yield better results. Motivated by this trend, we ask: can simple adaptive methods, based on SGD perform as well or better? We revisit the momentum SGD algorithm and show that hand-tuning a single learning rate and momentum makes it competitive with Adam. We then analyze its robustness to learning rate misspecification and objective curvature variation. Based on these insights, we design YellowFin, an automatic tuner for momentum and learning rate in SGD. YellowFin optionally uses a negative-feedback loop to compensate for the momentum dynamics in asynchronous settings on the fly. 
We empirically show YellowFin can converge in fewer iterations than Adam on ResNets and LSTMs for image recognition, language modeling and constituency parsing, with a speedup of up to $3.28$x in synchronous and up to $2.69$x in asynchronous settings.", "keywords": "adaptive optimizer;momentum;hyperparameter tuning", "primary_area": "", "supplementary_material": "", "author": "Jian Zhang;Ioannis Mitliagkas;Christopher Re", "authorids": "zjian@cs.stanford.edu;ioannis@iro.umontreal.ca;chrismre@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2018yellowfin,\ntitle={YellowFin and the Art of Momentum Tuning},\nauthor={Jian Zhang and Ioannis Mitliagkas and Christopher Re},\nyear={2018},\nurl={https://openreview.net/forum?id=SyrGJYlRZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SyrGJYlRZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyrGJYlRZ", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;5;1", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12061860214243684143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Critical Points of Linear Neural Networks: Analytical Forms and Landscape Properties", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/169", "id": "SysEexbRb", "author_site": "Yi Zhou, Yingbin Liang", "tldr": "We provide necessary and sufficient analytical forms for the critical points of the square loss functions for various neural networks, and exploit the analytical forms to characterize the landscape properties for the loss functions of these neural networks.", "abstract": "Due to the success of deep learning to solving a variety of challenging machine learning tasks, there is a rising interest in understanding loss functions for training neural networks from a theoretical aspect. Particularly, the properties of critical points and the landscape around them are of importance to determine the convergence performance of optimization algorithms. In this paper, we provide a necessary and sufficient characterization of the analytical forms for the critical points (as well as global minimizers) of the square loss functions for linear neural networks. We show that the analytical forms of the critical points characterize the values of the corresponding loss functions as well as the necessary and sufficient conditions to achieve global minimum. Furthermore, we exploit the analytical forms of the critical points to characterize the landscape properties for the loss functions of linear neural networks and shallow ReLU networks. 
One particular conclusion is that: While the loss function of linear networks has no spurious local minimum, the loss function of one-hidden-layer nonlinear networks with ReLU activation function does have local minimum that is not global minimum.", "keywords": "neural networks;critical points;analytical form;landscape", "primary_area": "", "supplementary_material": "", "author": "Yi Zhou;Yingbin Liang", "authorids": "zhou.1172@osu.edu;liang.889@osu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhou2018critical,\ntitle={Critical Points of Linear Neural Networks: Analytical Forms and Landscape Properties},\nauthor={Yi Zhou and Yingbin Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SysEexbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9564616842927093051&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SysEexbRb", "pdf": "https://openreview.net/pdf?id=SysEexbRb", "email": ";", "author_num": 2 }, { "id": "Syt0r4bRZ", "title": "Tree2Tree Learning with Memory Unit", "track": "main", "status": "Reject", "tldr": "", "abstract": "Traditional recurrent neural network (RNN) or convolutional neural net- work (CNN) based sequence-to-sequence model can not handle tree structural data well. To alleviate this problem, in this paper, we propose a tree-to-tree model with specially designed encoder unit and decoder unit, which recursively encodes tree inputs into highly folded tree embeddings and decodes the embeddings into tree outputs. Our model could represent the complex information of a tree while also restore a tree from embeddings.\nWe evaluate our model in random tree recovery task and neural machine translation task. 
Experiments show that our model outperforms the baseline model.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ning Miao;Hengliang Wang;Ran Le;Chongyang Tao;Mingyue Shang;Rui Yan;Dongyan Zhao", "authorids": "miaoning@pku.edu.cn;wanghl@pku.edu.cn;leran@buaa.edu.cn;chongyangtao@pku.edu.cn;shangmy@pku.edu.cn;ruiyan@pku.edu.cn;zhaody@pku.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nmiao2018treetree,\ntitle={Tree2Tree Learning with Memory Unit},\nauthor={Ning Miao and Hengliang Wang and Ran Le and Chongyang Tao and Mingyue Shang and Rui Yan and Dongyan Zhao},\nyear={2018},\nurl={https://openreview.net/forum?id=Syt0r4bRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Syt0r4bRZ", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9qvNaHSmP34J:scholar.google.com/&scioq=Tree2Tree+Learning+with+Memory+Unit&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "SyuWNMZ0W", "title": "Directing Generative Networks with Weighted Maximum Mean Discrepancy", "track": "main", "status": "Reject", "tldr": "We propose an estimator for the maximum mean discrepancy, appropriate when a target distribution is only accessible via a biased sample selection procedure, and show that it can be used in a generative network to correct for this bias.", "abstract": "The maximum mean discrepancy (MMD) between two probability measures P\nand Q is a metric that is zero if and only if all moments of the two measures\nare equal, making it an appealing statistic for two-sample tests. Given i.i.d. samples\nfrom P and Q, Gretton et al. (2012) show that we can construct an unbiased\nestimator for the square of the MMD between the two distributions. If P is a\ndistribution of interest and Q is the distribution implied by a generative neural\nnetwork with stochastic inputs, we can use this estimator to train our neural network.\nHowever, in practice we do not always have i.i.d. samples from our target\nof interest. Data sets often exhibit biases\u2014for example, under-representation of\ncertain demographics\u2014and if we ignore this fact our machine learning algorithms\nwill propagate these biases. Alternatively, it may be useful to assume our data has\nbeen gathered via a biased sample selection mechanism in order to manipulate\nproperties of the estimating distribution Q.\nIn this paper, we construct an estimator for the MMD between P and Q when we\nonly have access to P via some biased sample selection mechanism, and suggest\nmethods for estimating this sample selection mechanism when it is not already\nknown. We show that this estimator can be used to train generative neural networks\non a biased data sample, to give a simulator that reverses the effect of that\nbias.", "keywords": "generative networks;two sample tests;bias correction;maximum mean discrepancy", "primary_area": "", "supplementary_material": "", "author": "Maurice Diesendruck;Guy W. 
Cole;Sinead Williamson", "authorids": "momod@utexas.edu;guywcole@utexas.edu;sinead.williamson@mccombs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndiesendruck2018directing,\ntitle={Directing Generative Networks with Weighted Maximum Mean Discrepancy},\nauthor={Maurice Diesendruck and Guy W. Cole and Sinead Williamson},\nyear={2018},\nurl={https://openreview.net/forum?id=SyuWNMZ0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyuWNMZ0W", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10733797735097348485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SyunbfbAb", "title": "FigureQA: An Annotated Figure Dataset for Visual Reasoning", "track": "main", "status": "Workshop", "tldr": "We present a question-answering dataset, FigureQA, as a first step towards developing models that can intuitively recognize patterns from visual representations of data.", "abstract": "We introduce FigureQA, a visual reasoning corpus of over one million question-answer pairs grounded in over 100,000 images. The images are synthetic, scientific-style figures from five classes: line plots, dot-line plots, vertical and horizontal bar graphs, and pie charts. We formulate our reasoning task by generating questions from 15 templates; questions concern various relationships between plot elements and examine characteristics like the maximum, the minimum, area-under-the-curve, smoothness, and intersection. To resolve, such questions often require reference to multiple plot elements and synthesis of information distributed spatially throughout a figure. To facilitate the training of machine learning systems, the corpus also includes side data that can be used to formulate auxiliary objectives. In particular, we provide the numerical data used to generate each figure as well as bounding-box annotations for all plot elements. We study the proposed visual reasoning task by training several models, including the recently proposed Relation Network as strong baseline. Preliminary results indicate that the task poses a significant machine learning challenge. 
We envision FigureQA as a first step towards developing models that can intuitively recognize patterns from visual representations of data.", "keywords": "dataset;computer vision;deep learning;visual reasoning;relational reasoning", "primary_area": "", "supplementary_material": "", "author": "Samira Ebrahimi Kahou;Adam Atkinson;Vincent Michalski;\u00c1kos K\u00e1d\u00e1r;Adam Trischler;Yoshua Bengio", "authorids": "samira.ebrahimi@microsoft.com;adatkins@microsoft.com;vincent.michalski@umontreal.ca;kadar.akos@gmail.com;adam.trischler@microsoft.com;yoshua.bengio@umontreal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nebrahimi2018figureqa,\ntitle={Figure{QA}: An Annotated Figure Dataset for Visual Reasoning},\nauthor={Samira Ebrahimi Kahou and Adam Atkinson and Vincent Michalski and \u00c1kos K\u00e1d\u00e1r and Adam Trischler and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=SyunbfbAb},\n}", "github": "[![github](/images/github_icon.svg) vmichals/FigureQA-baseline](https://github.com/vmichals/FigureQA-baseline)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyunbfbAb", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 360, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7004678543332000257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SyvCD-b0W", "title": "Autostacker: an Automatic Evolutionary Hierarchical Machine Learning System", "track": "main", "status": "Reject", "tldr": "Automate machine learning system with efficient search algorithm and innovative structure to provide better model baselines.", "abstract": "This work provides an automatic machine learning (AutoML) modelling architecture called Autostacker. Autostacker improves the prediction accuracy of machine learning baselines by utilizing an innovative hierarchical stacking architecture and an efficient parameter search algorithm. Neither prior domain knowledge about the data nor feature preprocessing is needed. We significantly reduce the time of AutoML with a naturally inspired algorithm - Parallel Hill Climbing (PHC). By parallelizing PHC, Autostacker can provide candidate pipelines with sufficient prediction accuracy within a short amount of time. These pipelines can be used as is or as a starting point for human experts to build on. By focusing on the modelling process, Autostacker breaks the tradition of following fixed order pipelines by exploring not only single model pipeline but also innovative combinations and structures. 
As we will show in the experiment section, Autostacker achieves significantly better performance both in terms of test accuracy and time cost comparing with human initial trials and recent popular AutoML system.", "keywords": "Machine Learning;AutoML", "primary_area": "", "supplementary_material": "", "author": "Boyuan Chen;Warren Mo;Ishanu Chattopadhyay;Hod Lipson", "authorids": "boyuan.chen@columbia.edu;warrenmo@uchicago.edu;ishanu@uchicago.edu;hod.lipson@columbia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2018autostacker,\ntitle={Autostacker: an Automatic Evolutionary Hierarchical Machine Learning System},\nauthor={Boyuan Chen and Warren Mo and Ishanu Chattopadhyay and Hod Lipson},\nyear={2018},\nurl={https://openreview.net/forum?id=SyvCD-b0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyvCD-b0W", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9269790466687275075&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SywMS6ZfM", "title": "Distributional Inclusion Vector Embedding for Unsupervised Hypernymy Detection", "track": "main", "status": "Withdraw", "tldr": "We propose a novel unsupervised word embedding which preserves the inclusion property in the context distribution and achieve state-of-the-art results on unsupervised hypernymy detection", "abstract": "Modeling hypernymy, such as poodle is-a dog, is an important generalization aid to many NLP tasks, such as entailment, relation extraction, and question answering. Supervised learning from labeled hypernym sources, such as WordNet, limit the coverage of these models, which can be addressed by learning hypernyms from unlabeled text. Existing unsupervised methods either do not scale to large vocabularies or yield unacceptably poor accuracy. This paper introduces {\\it distributional inclusion vector embedding (DIVE)}, a simple-to-implement unsupervised method of hypernym discovery via per-word non-negative vector embeddings which preserve the inclusion property of word contexts. In experimental evaluations more comprehensive than any previous literature of which we are aware---evaluating on 11 datasets using multiple existing as well as newly proposed scoring functions---we find that our method provides up to double the precision of previous unsupervised methods, and the highest average performance, using a much more compact word representation, and yielding many new state-of-the-art results. 
In addition, the meaning of each dimension in DIVE is interpretable, which leads to a novel approach on word sense disambiguation as another promising application of DIVE.", "keywords": "unsupervised word embedding;unsupervised hypernym detection;distributional inclusion hypothesis;non-negative matrix factorization;word sense disambiguation;hypernym scoring functions", "primary_area": "", "supplementary_material": "", "author": "Haw-Shiuan Chang;ZiYun Wang;Luke Vilnis;Andrew McCallum", "authorids": "hschang@cs.umass.edu;wang-zy14@mails.tsinghua.edu.cn;luke@cs.umass.edu;mccallum@cs.umass.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SywMS6ZfM", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;5;5", "rating_avg": 4.666666666666667, "confidence_avg": 5.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=633340607689358624&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Deep Learning and Quantum Entanglement: Fundamental Connections with Implications to Network Design", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/310", "id": "SywXXwJAb", "author_site": "Yoav Levine, David Yakira, Nadav Cohen, Amnon Shashua", "tldr": "Employing quantum entanglement measures for quantifying correlations in deep learning, and using the connection to fit the deep network's architecture to correlations in the data.", "abstract": "Formal understanding of the inductive bias behind deep convolutional networks, i.e. the relation between the network's architectural features and the functions it is able to model, is limited. In this work, we establish a fundamental connection between the fields of quantum physics and deep learning, and use it for obtaining novel theoretical observations regarding the inductive bias of convolutional networks. Specifically, we show a structural equivalence between the function realized by a convolutional arithmetic circuit (ConvAC) and a quantum many-body wave function, which facilitates the use of quantum entanglement measures as quantifiers of a deep network's expressive ability to model correlations. Furthermore, the construction of a deep ConvAC in terms of a quantum Tensor Network is enabled. This allows us to perform a graph-theoretic analysis of a convolutional network, tying its expressiveness to a min-cut in its underlying graph. We demonstrate a practical outcome in the form of a direct control over the inductive bias via the number of channels (width) of each layer. We empirically validate our findings on standard convolutional networks which involve ReLU activations and max pooling. 
The description of a deep convolutional network in well-defined graph-theoretic tools and the structural connection to quantum entanglement, are two interdisciplinary bridges that are brought forth by this work.", "keywords": "deep learning;quantum entanglement;quantum physics;many body physics;data correlations;inductive bias;tensor networks", "primary_area": "", "supplementary_material": "", "author": "Yoav Levine;David Yakira;Nadav Cohen;Amnon Shashua", "authorids": "yoavlevine@cs.huji.ac.il;davidyakira@cs.huji.ac.il;cohennadav@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlevine2018deep,\ntitle={Deep Learning and Quantum Entanglement: Fundamental Connections with Implications to Network Design},\nauthor={Yoav Levine and David Yakira and Nadav Cohen and Amnon Shashua},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SywXXwJAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer6;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7;8", "confidence": "2;4;3;5", "rating_avg": 6.75, "confidence_avg": 3.5, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.674199862463242, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=239189200302689850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SywXXwJAb", "pdf": "https://openreview.net/pdf?id=SywXXwJAb", "email": ";;;", "author_num": 4 }, { "id": "Syx6bz-Ab", "title": "Seq2SQL: Generating Structured Queries From Natural Language Using Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We introduce Seq2SQL, which translates questions to SQL queries using rewards from online query execution, and WikiSQL, a SQL table/question/query dataset orders of magnitude larger than existing datasets.", "abstract": "Relational databases store a significant amount of the worlds data. However, accessing this data currently requires users to understand a query language such as SQL. We propose Seq2SQL, a deep neural network for translating natural language questions to corresponding SQL queries. Our model uses rewards from in the loop query execution over the database to learn a policy to generate the query, which contains unordered parts that are less suitable for optimization via cross entropy loss. Moreover, Seq2SQL leverages the structure of SQL to prune the space of generated queries and significantly simplify the generation problem. In addition to the model, we release WikiSQL, a dataset of 80654 hand-annotated examples of questions and SQL queries distributed across 24241 tables fromWikipedia that is an order of magnitude larger than comparable datasets. 
By applying policy based reinforcement learning with a query execution environment to WikiSQL, Seq2SQL outperforms a state-of-the-art semantic parser, improving execution accuracy from 35.9% to 59.4% and logical form accuracy from 23.4% to 48.3%.", "keywords": "deep learning;reinforcement learning;dataset;natural language processing;natural language interface;sql", "primary_area": "", "supplementary_material": "", "author": "Victor Zhong;Caiming Xiong;Richard Socher", "authorids": "victor@victorzhong.com;cxiong@salesforce.com;richard@socher.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhong2018seqsql,\ntitle={Seq2{SQL}: Generating Structured Queries From Natural Language Using Reinforcement Learning },\nauthor={Victor Zhong and Caiming Xiong and Richard Socher},\nyear={2018},\nurl={https://openreview.net/forum?id=Syx6bz-Ab},\n}", "github": "[![github](/images/github_icon.svg) salesforce/WikiSQL](https://github.com/salesforce/WikiSQL) + [![Papers with Code](/images/pwc_icon.svg) 14 community implementations](https://paperswithcode.com/paper/?openreview=Syx6bz-Ab)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Syx6bz-Ab", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1367, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1449841204387852738&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "SyxCqGbRZ", "title": "Learning to Treat Sepsis with Multi-Output Gaussian Process Deep Recurrent Q-Networks", "track": "main", "status": "Reject", "tldr": "We combine Multi-output Gaussian processes with deep recurrent Q-networks to learn optimal treatments for sepsis and show improved performance over standard deep reinforcement learning methods,", "abstract": "Sepsis is a life-threatening complication from infection and a leading cause of mortality in hospitals. While early detection of sepsis improves patient outcomes, there is little consensus on exact treatment guidelines, and treating septic patients remains an open problem. In this work we present a new deep reinforcement learning method that we use to learn optimal personalized treatment policies for septic patients. We model patient continuous-valued physiological time series using multi-output Gaussian processes, a probabilistic model that easily handles missing values and irregularly spaced observation times while maintaining estimates of uncertainty. The Gaussian process is directly tied to a deep recurrent Q-network that learns clinically interpretable treatment policies, and both models are learned together end-to-end. We evaluate our approach on a heterogeneous dataset of septic spanning 15 months from our university health system, and find that our learned policy could reduce patient mortality by as much as 8.2\\% from an overall baseline mortality rate of 13.3\\%. 
Our algorithm could be used to make treatment recommendations to physicians as part of a decision support tool, and the framework readily applies to other reinforcement learning problems that rely on sparsely sampled and frequently missing multivariate time series data.\n", "keywords": "Healthcare;Gaussian Process;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Joseph Futoma;Anthony Lin;Mark Sendak;Armando Bedoya;Meredith Clement;Cara O'Brien;Katherine Heller", "authorids": "jfutoma14@gmail.com;anthony.lin@duke.edu;mark.sendak@duke.edu;armando.bedoya@duke.edu;meredith.edwards@duke.edu;cara.obrien@duke.edu;kheller@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nfutoma2018learning,\ntitle={Learning to Treat Sepsis with Multi-Output Gaussian Process Deep Recurrent Q-Networks},\nauthor={Joseph Futoma and Anthony Lin and Mark Sendak and Armando Bedoya and Meredith Clement and Cara O'Brien and Katherine Heller},\nyear={2018},\nurl={https://openreview.net/forum?id=SyxCqGbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyxCqGbRZ", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": -0.944911182523068, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4847992067347216309&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Regularizing and Optimizing LSTM Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/316", "id": "SyyGPP0TZ", "author_site": "Stephen Merity, Nitish Shirish Keskar, richard socher", "tldr": "Effective regularization and optimization strategies for LSTM-based language models achieves SOTA on PTB and WT2. ", "abstract": "In this paper, we consider the specific problem of word-level language modeling and investigate strategies for regularizing and optimizing LSTM-based models. We propose the weight-dropped LSTM, which uses DropConnect on hidden-to-hidden weights, as a form of recurrent regularization. Further, we introduce NT-ASGD, a non-monotonically triggered (NT) variant of the averaged stochastic gradient method (ASGD), wherein the averaging trigger is determined using a NT condition as opposed to being tuned by the user. Using these and other regularization strategies, our ASGD Weight-Dropped LSTM (AWD-LSTM) achieves state-of-the-art word level perplexities on two data sets: 57.3 on Penn Treebank and 65.8 on WikiText-2. In exploring the effectiveness of a neural cache in conjunction with our proposed model, we achieve an even lower state-of-the-art perplexity of 52.8 on Penn Treebank and 52.0 on WikiText-2. We also explore the viability of the proposed regularization and optimization strategies in the context of the quasi-recurrent neural network (QRNN) and demonstrate comparable performance to the AWD-LSTM counterpart. 
The code for reproducing the results is open sourced and is available at https://github.com/salesforce/awd-lstm-lm.", "keywords": "language model;LSTM;regularization;optimization;ASGD;dropconnect", "primary_area": "", "supplementary_material": "", "author": "Stephen Merity;Nitish Shirish Keskar;Richard Socher", "authorids": "smerity@smerity.com;keskar.nitish@u.northwestern.edu;richard@socher.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmerity2018regularizing,\ntitle={Regularizing and Optimizing {LSTM} Language Models},\nauthor={Stephen Merity and Nitish Shirish Keskar and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyyGPP0TZ},\n}", "github": "[![github](/images/github_icon.svg) salesforce/awd-lstm-lm](https://github.com/salesforce/awd-lstm-lm) + [![Papers with Code](/images/pwc_icon.svg) 46 community implementations](https://paperswithcode.com/paper/?openreview=SyyGPP0TZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1415, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10613038919449342432&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SyyGPP0TZ", "pdf": "https://openreview.net/pdf?id=SyyGPP0TZ", "email": ";;", "author_num": 3 }, { "title": "Backpropagation through the Void: Optimizing control variates for black-box gradient estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/192", "id": "SyzKd1bCW", "author_site": "Will Grathwohl, Dami Choi, Yuhuai Wu, Geoffrey Roeder, David Duvenaud", "tldr": "We present a general method for unbiased estimation of gradients of black-box functions of random variables. We apply this method to discrete variational inference and reinforcement learning. ", "abstract": "Gradient-based optimization is the foundation of deep learning and reinforcement learning.\nEven when the mechanism being optimized is unknown or not differentiable, optimization using high-variance or biased gradient estimates is still often the best strategy. We introduce a general framework for learning low-variance, unbiased gradient estimators for black-box functions of random variables, based on gradients of a learned function.\nThese estimators can be jointly trained with model parameters or policies, and are applicable in both discrete and continuous settings. We give unbiased, adaptive analogs of state-of-the-art reinforcement learning methods such as advantage actor-critic. 
We also demonstrate this framework for training discrete latent-variable models.", "keywords": "optimization;machine learning;variational inference;reinforcement learning;gradient estimation;deep learning;discrete optimization", "primary_area": "", "supplementary_material": "", "author": "Will Grathwohl;Dami Choi;Yuhuai Wu;Geoff Roeder;David Duvenaud", "authorids": "wgrathwohl@cs.toronto.edu;choidami@cs.toronto.edu;ywu@cs.toronto.edu;roeder@cs.toronto.edu;duvenaud@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngrathwohl2018backpropagation,\ntitle={Backpropagation through the Void: Optimizing control variates for black-box gradient estimation},\nauthor={Will Grathwohl and Dami Choi and Yuhuai Wu and Geoff Roeder and David Duvenaud},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=SyzKd1bCW},\n}", "github": "[![github](/images/github_icon.svg) duvenaud/relax](https://github.com/duvenaud/relax) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=SyzKd1bCW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;3;4", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 1.0, "gs_citation": 353, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14404204871710653077&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyzKd1bCW", "pdf": "https://openreview.net/pdf?id=SyzKd1bCW", "email": ";;;;", "author_num": 5 }, { "id": "r1-4BLaQz", "title": "Cluster-based Warm-Start Nets", "track": "main", "status": "Withdraw", "tldr": "Cluster before you classify; using weak labels to improve classification ", "abstract": "Theories in cognitive psychology postulate that humans use similarity as a basis\nfor object categorization. However, work in image classification generally as-\nsumes disjoint and equally dissimilar classes to achieve super-human levels of\nperformance on certain datasets. In our work, we adapt notions of similarity using\nweak labels over multiple hierarchical levels to boost classification performance.\nInstead of pitting clustering directly against classification, we use a warm-start\nbased evaluation to explicitly provide value to a clustering representation by its\nability to aid classification. We evaluate on CIFAR10 and a fine-grained classifi-\ncation dataset to show improvements in performance with the procedural addition\nof intermediate losses and weak labels based on multiple hierarchy levels. Further-\nmore, we show that pretraining AlexNet on hierarchical weak labels in conjunc-\ntion with intermediate losses outperforms a classification baseline by over 17% on\na subset of Birdsnap dataset. 
Finally, we show improvement over AlexNet trained\nusing ImageNet pre-trained weights as initializations which further supports our \nclaim of the importance of similarity.", "keywords": "hierarchical labels;weak labels;pairwise constraints;clustering;classification", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper998/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018cluster-based,\n title={Cluster-based Warm-Start Nets},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=ry7m6fZRW}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1-4BLaQz", "pdf_size": 0, "rating": "3;3;6", "confidence": "4;4;5", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": 1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r111KtCp-", "title": "Taking Apart Autoencoders: How do They Encode Geometric Shapes ?", "track": "main", "status": "Reject", "tldr": "We study the functioning of autoencoders in a simple setting and advise new strategies for their regularisation in order to obtain bettre generalisation with latent interpolation in mind for image sythesis. ", "abstract": "We study the precise mechanisms which allow autoencoders to encode and decode a simple geometric shape, the disk. In this carefully controlled setting, we are able to describe the specific form of the optimal solution to the minimisation problem of the training step. We show that the autoencoder indeed approximates this solution during training. Secondly, we identify a clear failure in the generalisation capacity of the autoencoder, namely its inability to interpolate data. Finally, we explore several regularisation schemes to resolve the generalisation problem. Given the great attention that has been recently given to the generative capacity of neural networks, we believe that studying in depth simple geometric cases sheds some light on the generation process and can provide a minimal requirement experimental setup for more complex architectures. 
\n", "keywords": "autoencoders;CNN;image synthesis;latent space", "primary_area": "", "supplementary_material": "", "author": "Alasdair Newson;Andres Almansa;Yann Gousseau;Said Ladjal", "authorids": "alasdairnewson@gmail.com;andres.almansa@parisdescartes.fr;yann.gousseau@telecom-paristech.fr;said.ladjal@telecom-paristech.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnewson2018taking,\ntitle={Taking Apart Autoencoders: How do They Encode Geometric Shapes ?},\nauthor={Alasdair Newson and Andres Almansa and Yann Gousseau and Said Ladjal},\nyear={2018},\nurl={https://openreview.net/forum?id=r111KtCp-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r111KtCp-", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;3", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=254518826431405750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Auto-Conditioned Recurrent Networks for Extended Complex Human Motion Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/266", "id": "r11Q2SlRW", "author_site": "Yi Zhou, Zimo Li, Shuangjiu Xiao, Chong He, Zeng Huang, Hao Li", "tldr": "Synthesize complex and extended human motions using an auto-conditioned LSTM network", "abstract": "We present a real-time method for synthesizing highly complex human motions using a novel training regime we call the auto-conditioned Recurrent Neural Network (acRNN). Recently, researchers have attempted to synthesize new motion by using autoregressive techniques, but existing methods tend to freeze or diverge after a couple of seconds due to an accumulation of errors that are fed back into the network. Furthermore, such methods have only been shown to be reliable for relatively simple human motions, such as walking or running. In contrast, our approach can synthesize arbitrary motions with highly complex styles, including dances or martial arts in addition to locomotion. The acRNN is able to accomplish this by explicitly accommodating for autoregressive noise accumulation during training. Our work is the first to our knowledge that demonstrates the ability to generate over 18,000 continuous frames (300 seconds) of new complex human motion w.r.t. different styles. 
", "keywords": "motion synthesis;motion prediction;human pose;human motion;recurrent networks;lstm", "primary_area": "", "supplementary_material": "", "author": "Yi Zhou;Zimo Li;Shuangjiu Xiao;Chong He;Zeng Huang;Hao Li", "authorids": "zhou859@usc.edu;zimoli@usc.edu;xsjiu99@sjtu.edu.cn;sal@sjtu.edu.cn;zenghuang@usc.edu;hao@hao-li.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nzhou2018autoconditioned,\ntitle={Auto-Conditioned Recurrent Networks for Extended Complex Human Motion Synthesis},\nauthor={Yi Zhou and Zimo Li and Shuangjiu Xiao and Chong He and Zeng Huang and Hao Li},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r11Q2SlRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;5;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 264, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5399907966651514315&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r11Q2SlRW", "pdf": "https://openreview.net/pdf?id=r11Q2SlRW", "email": ";;;;;", "author_num": 6 }, { "id": "r154_g-Rb", "title": "Composable Planning with Attributes", "track": "main", "status": "Reject", "tldr": "Compositional attribute-based planning that generalizes to long test tasks, despite being trained on short & simple tasks.", "abstract": "The tasks that an agent will need to solve often aren\u2019t known during training. However, if the agent knows which properties of the environment we consider im- portant, then after learning how its actions affect those properties the agent may be able to use this knowledge to solve complex tasks without training specifi- cally for them. Towards this end, we consider a setup in which an environment is augmented with a set of user defined attributes that parameterize the features of interest. We propose a model that learns a policy for transitioning between \u201cnearby\u201d sets of attributes, and maintains a graph of possible transitions. Given a task at test time that can be expressed in terms of a target set of attributes, and a current state, our model infers the attributes of the current state and searches over paths through attribute space to get a high level plan, and then uses its low level policy to execute the plan. 
We show in grid-world games and 3D block stacking that our model is able to generalize to longer, more complex tasks at test time even when it only sees short, simple tasks at train time.\n", "keywords": "Planning;Compositionality;Attributes;Reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Amy Zhang;Adam Lerer;Sainbayar Sukhbaatar;Rob Fergus;Arthur Szlam", "authorids": "amyzhang@fb.com;alerer@fb.com;sainbar@cs.nyu.edu;fergus@cs.nyu.edu;aszlam@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2018composable,\ntitle={Composable Planning with Attributes},\nauthor={Amy Zhang and Adam Lerer and Sainbayar Sukhbaatar and Rob Fergus and Arthur Szlam},\nyear={2018},\nurl={https://openreview.net/forum?id=r154_g-Rb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r154_g-Rb", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14836553939348389093&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "r15kjpHa-", "title": "Reward Design in Cooperative Multi-agent Reinforcement Learning for Packet Routing", "track": "main", "status": "Reject", "tldr": "We study reward design problem in cooperative MARL based on packet routing environments. The experimental results remind us to be careful to design the rewards, as they are really important to guide the agent behavior.", "abstract": "In cooperative multi-agent reinforcement learning (MARL), how to design a suitable reward signal to accelerate learning and stabilize convergence is a critical problem. The global reward signal assigns the same global reward to all agents without distinguishing their contributions, while the local reward signal provides different local rewards to each agent based solely on individual behavior. Both of the two reward assignment approaches have some shortcomings: the former might encourage lazy agents, while the latter might produce selfish agents.\n\nIn this paper, we study reward design problem in cooperative MARL based on packet routing environments. Firstly, we show that the above two reward signals are prone to produce suboptimal policies. Then, inspired by some observations and considerations, we design some mixed reward signals, which are off-the-shelf to learn better policies. Finally, we turn the mixed reward signals into the adaptive counterparts, which achieve best results in our experiments. Other reward signals are also discussed in this paper. 
As reward design is a very fundamental problem in RL and especially in MARL, we hope that MARL researchers can rethink the rewards used in their systems.", "keywords": "Reward Design;Cooperative Multi-agent Reinforcement Learning;Packet Routing", "primary_area": "", "supplementary_material": "", "author": "Hangyu Mao;Zhibo Gong;Zhen Xiao", "authorids": "pku.hy.mao@gmail.com;gongzhibo@huawei.com;gtxaio@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmao2018reward,\ntitle={Reward Design in Cooperative Multi-agent Reinforcement Learning for Packet Routing},\nauthor={Hangyu Mao and Zhibo Gong and Zhen Xiao},\nyear={2018},\nurl={https://openreview.net/forum?id=r15kjpHa-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r15kjpHa-", "pdf_size": 0, "rating": "2;5;5", "confidence": "4;2;3", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9971627554353168372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "r16Vyf-0-", "title": "Image Transformer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Image generation has been successfully cast as an autoregressive sequence generation\nor transformation problem. Recent work has shown that self-attention is\nan effective way of modeling textual sequences. In this work, we generalize a\nrecently proposed model architecture based on self-attention, the Transformer, to\na sequence modeling formulation of image generation with a tractable likelihood.\nBy restricting the self-attention mechanism to attend to local neighborhoods we\nsignificantly increase the size of images the model can process in practice, despite\nmaintaining significantly larger receptive fields per layer than typical convolutional\nneural networks. We propose another extension of self-attention allowing it\nto efficiently take advantage of the two-dimensional nature of images.\nWhile conceptually simple, our generative models trained on two image data sets\nare competitive with or significantly outperform the current state of the art in autoregressive\nimage generation on two different data sets, CIFAR-10 and ImageNet.\nWe also present results on image super-resolution with a large magnification ratio,\napplying an encoder-decoder configuration of our architecture. In a human\nevaluation study, we show that our super-resolution models improve significantly\nover previously published autoregressive super-resolution models. 
Images they\ngenerate fool human observers three times more often than the previous state of\nthe art.", "keywords": "image generation;super-resolution;self-attention;transformer", "primary_area": "", "supplementary_material": "", "author": "Ashish Vaswani;Niki Parmar;Jakob Uszkoreit;Noam Shazeer;Lukasz Kaiser", "authorids": "avaswani@google.com;nikip@google.com;uszkoreit@google.com;noam@google.com;lukaszkaiser@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nvaswani2018image,\ntitle={Image Transformer},\nauthor={Ashish Vaswani and Niki Parmar and Jakob Uszkoreit and Noam Shazeer and Lukasz Kaiser},\nyear={2018},\nurl={https://openreview.net/forum?id=r16Vyf-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r16Vyf-0-", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 2281, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7958557148623619738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "r16u6i_Xz", "title": "Interactive Boosting of Neural Networks for Small-sample Image Classification", "track": "main", "status": "Withdraw", "tldr": "In the paper, we proposed an ensemble method called InterBoost for training neural networks for small-sample classification. The method has better generalization performance than other ensemble methods, and reduces variances significantly.", "abstract": "Neural networks have recently shown excellent performance on numerous classi- fication tasks. These networks often have a large number of parameters and thus require much data to train. When the number of training data points is small, however, a network with high flexibility will quickly overfit the training data, resulting in a large model variance and a poor generalization performance. To address this problem, we propose a new ensemble learning method called InterBoost for small-sample image classification. In the training phase, InterBoost first randomly generates two complementary datasets to train two base networks of the same structure, separately, and then next two complementary datasets for further training the networks are generated through interaction (or information sharing) between the two base networks trained previously. This interactive training process continues iteratively until a stop criterion is met. In the testing phase, the outputs of the two networks are combined to obtain one final score for classification. 
Detailed analysis of the method is provided for an in-depth understanding of its mechanism.", "keywords": "ensemble learning;neural network;small-sample;overfitting;variance", "primary_area": "", "supplementary_material": "", "author": "Xiaoxu Li;Dongliang Chang;Zheng-Hua Tan;Zhanyu Ma;Jun Guo;Jie Cao", "authorids": "xiaoxulilut@gmail.com;dlchanglut@hotmai.com;zt@es.aau.dk;mazhanyu@bupt.edu.cn;guojun@bupt.edu.cn;caoj@lut.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r16u6i_Xz", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9NV_cDgfH_UJ:scholar.google.com/&scioq=Interactive+Boosting+of+Neural+Networks+for+Small-sample+Image+Classification&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "r17Q6WWA-", "title": "Multi-Task Learning by Deep Collaboration and Application in Facial Landmark Detection", "track": "main", "status": "Reject", "tldr": "We propose a novel approach for connecting task-specific networks in a multi-task learning setting based on recent residual network advances.", "abstract": "Convolutional neural networks (CNN) have become the most successful and popular approach in many vision-related domains. While CNNs are particularly well-suited for capturing a proper hierarchy of concepts from real-world images, they are limited to domains where data is abundant. Recent attempts have looked into mitigating this data scarcity problem by casting their original single-task problem into a new multi-task learning (MTL) problem. The main goal of this inductive transfer mechanism is to leverage domain-specific information from related tasks, in order to improve generalization on the main task. While recent results in the deep learning (DL) community have shown the promising potential of training task-specific CNNs in a soft parameter sharing framework, integrating the recent DL advances for improving knowledge sharing is still an open problem. In this paper, we propose the Deep Collaboration Network (DCNet), a novel approach for connecting task-specific CNNs in a MTL framework. We define connectivity in terms of two distinct non-linear transformation blocks. One aggregates task-specific features into global features, while the other merges back the global features with each task-specific network. Based on the observation that task relevance depends on depth, our transformation blocks use skip connections as suggested by residual network approaches, to more easily deactivate unrelated task-dependent features. To validate our approach, we employed facial landmark detection (FLD) datasets as they are readily amenable to MTL, given the number of tasks they include. Experimental results show that we can achieve up to 24.31% relative improvement in landmark failure rate over other state-of-the-art MTL approaches. 
We finally perform an ablation study showing that our approach effectively allows knowledge sharing, by leveraging domain-specific features at particular depths from tasks that we know are related.", "keywords": "multi-task learning;soft parameter sharing;facial landmark detection", "primary_area": "", "supplementary_material": "", "author": "Ludovic Trottier;Philippe Gigu\u00e8re;Brahim Chaib-draa", "authorids": "ludovic.trottier.1@ulaval.ca;philippe.giguere@ift.ulaval.ca;brahim.chaib-draa@ift.ulaval.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntrottier2018multitask,\ntitle={Multi-Task Learning by Deep Collaboration and Application in Facial Landmark Detection},\nauthor={Ludovic Trottier and Philippe Gigu\u00e8re and Brahim Chaib-draa},\nyear={2018},\nurl={https://openreview.net/forum?id=r17Q6WWA-},\n}", "github": "[![github](/images/github_icon.svg) ltrottier/deep-collaboration-network](https://github.com/ltrottier/deep-collaboration-network)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r17Q6WWA-", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10269821678777814605&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r17lFgZ0Z", "title": "Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Automated metrics such as BLEU are widely used in the machine translation literature. They have also been used recently in the dialogue community for evaluating dialogue response generation. However, previous work in dialogue response generation has shown that these metrics do not correlate strongly with human judgment in the non task-oriented dialogue setting. Task-oriented dialogue responses are expressed on narrower domains and exhibit lower diversity. It is thus reasonable to think that these automated metrics would correlate well with human judgment in the task-oriented setting where the generation task consists of translating dialogue acts into a sentence. We conduct an empirical study to confirm whether this is the case. Our findings indicate that these automated metrics have stronger correlation with human judgments in the task-oriented setting compared to what has been observed in the non task-oriented setting. We also observe that these metrics correlate even better for datasets which provide multiple ground truth reference sentences. 
In addition, we show that some of the currently available corpora for task-oriented language generation can be solved with simple models and advocate for more challenging datasets.", "keywords": "task-oriented dialog;goal-oriented dialog;nlg evaluation;natural language generation;automated metrics for nlg", "primary_area": "", "supplementary_material": "", "author": "Shikhar Sharma;Layla El Asri;Hannes Schulz;Jeremie Zumer", "authorids": "shikhar.sharma@microsoft.com;layla.elasri@microsoft.com;hannes.schulz@microsoft.com;jeremie_zumer@hotmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsharma2018relevance,\ntitle={Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation},\nauthor={Shikhar Sharma and Layla El Asri and Hannes Schulz and Jeremie Zumer},\nyear={2018},\nurl={https://openreview.net/forum?id=r17lFgZ0Z},\n}", "github": "[![github](/images/github_icon.svg) Maluuba/nlg-eval](https://github.com/Maluuba/nlg-eval) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=r17lFgZ0Z)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r17lFgZ0Z", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5144419831954579407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "r1AMITFaW", "title": "Dependent Bidirectional RNN with Extended-long Short-term Memory", "track": "main", "status": "Reject", "tldr": "A recurrent neural network cell with extended-long short-term memory and a multi-task RNN model for sequence-in-sequence-out problems", "abstract": "In this work, we first conduct mathematical analysis on the memory, which is\ndefined as a function that maps an element in a sequence to the current output,\nof three RNN cells; namely, the simple recurrent neural network (SRN), the long\nshort-term memory (LSTM) and the gated recurrent unit (GRU). Based on the\nanalysis, we propose a new design, called the extended-long short-term memory\n(ELSTM), to extend the memory length of a cell. Next, we present a multi-task\nRNN model that is robust to previous erroneous predictions, called the dependent\nbidirectional recurrent neural network (DBRNN), for the sequence-in-sequenceout\n(SISO) problem. Finally, the performance of the DBRNN model with the\nELSTM cell is demonstrated by experimental results.", "keywords": "RNN;memory;LSTM;GRU;BRNN;encoder-decoder;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Yuanhang Su;Yuzhong Huang;C.-C. Jay Kuo", "authorids": "yuanhans@usc.edu;yuzhongh@usc.edu;cckuo@sipi.usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsu2018dependent,\ntitle={Dependent Bidirectional {RNN} with Extended-long Short-term Memory},\nauthor={Yuanhang Su and Yuzhong Huang and C.-C. 
Jay Kuo},\nyear={2018},\nurl={https://openreview.net/forum?id=r1AMITFaW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1AMITFaW", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1884839025736970940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1AoGNlC-", "title": "Code Synthesis with Priority Queue Training", "track": "main", "status": "Reject", "tldr": "We use a simple search algorithm involving an RNN and priority queue to find solutions to coding tasks.", "abstract": "We consider the task of program synthesis in the presence of a reward function over the output of programs, where the goal is to find programs with maximal rewards. We introduce a novel iterative optimization scheme, where we train an RNN on a dataset of K best programs from a priority queue of the generated programs so far. Then, we synthesize new programs and add them to the priority queue by sampling from the RNN. We benchmark our algorithm called priority queue training (PQT) against genetic algorithm and reinforcement learning baselines on a simple but expressive Turing complete programming language called BF. Our experimental results show that our deceptively simple PQT algorithm significantly outperforms the baselines. By adding a program length penalty to the reward function, we are able to synthesize short, human readable programs.", "keywords": "code synthesis;program synthesis;genetic algorithm;reinforcement learning;policy gradient;reinforce;priority queue;topk buffer;bf;code golf;rnn", "primary_area": "", "supplementary_material": "", "author": "Daniel A. Abolafia;Quoc V. Le;Mohammad Norouzi", "authorids": "danabo@google.com;qvl@google.com;mnorouzi@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\na.2018code,\ntitle={Code Synthesis with Priority Queue Training},\nauthor={Daniel A. Abolafia and Quoc V. Le and Mohammad Norouzi},\nyear={2018},\nurl={https://openreview.net/forum?id=r1AoGNlC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1AoGNlC-", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dmSDLcgv9iwJ:scholar.google.com/&scioq=Code+Synthesis+with+Priority+Queue+Training&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1BRfhiab", "title": "The Principle of Logit Separation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider neural network training, in applications in which there are many possible classes, but at test-time, the task is to identify only whether the given example belongs to a specific class, which can be different in different applications of the classifier. For instance, this is the case in an image search engine. 
We consider the Single Logit Classification (SLC) task: training the network so that at test-time, it would be possible to accurately identify if the example belongs to a given class, based only on the output logit for this class. \nWe propose a natural principle, the Principle of Logit Separation, as a guideline for choosing and designing losses suitable for the SLC. \nWe show that the cross-entropy loss function is not aligned with the Principle of Logit Separation. In contrast, there are known loss functions, as well as novel batch loss functions that we propose, which are aligned with this principle. In total, we study seven loss functions. \nOur experiments show that indeed in almost all cases, losses that are aligned with Principle of Logit Separation obtain a 20%-35% relative performance improvement in the SLC task, compared to losses that are not aligned with it. We therefore conclude that the Principle of Logit Separation sheds light on an important property of the most common loss functions used by neural network classifiers. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gil Keren;Sivan Sabato;Bj\u00f6rn Schuller", "authorids": "cruvadom@gmail.com;sivan.sabato@gmail.com;bjoern.schuller@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkeren2018the,\ntitle={The Principle of Logit Separation},\nauthor={Gil Keren and Sivan Sabato and Bj\u00f6rn Schuller},\nyear={2018},\nurl={https://openreview.net/forum?id=r1BRfhiab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1BRfhiab", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1CE9GWR-", "title": "Understanding GANs: the LQG Setting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) have become a popular method to learn a probability model from data. Many GAN architectures with different optimization metrics have been introduced recently. Instead of proposing yet another architecture, this paper aims to provide an understanding of some of the basic issues surrounding GANs. First, we propose a natural way of specifying the loss function for GANs by drawing a connection with supervised learning. Second, we shed light on the statistical performance of GANs through the analysis of a simple LQG setting: the generator is linear, the loss function is quadratic and the data is drawn from a Gaussian distribution. We show that in this setting: 1) the optimal GAN solution converges to population Principal Component Analysis (PCA) as the number of training samples increases; 2) the number of samples required scales exponentially with the dimension of the data; 3) the number of samples scales almost linearly if the discriminator is constrained to be quadratic. 
Moreover, under this quadratic constraint on the discriminator, the optimal finite-sample GAN performs simply empirical PCA.", "keywords": "Generative Adversarial Networks;Wasserstein;Generalization;PCA", "primary_area": "", "supplementary_material": "", "author": "Soheil Feizi;Changho Suh;Fei Xia;David Tse", "authorids": "sfeizi@stanford.edu;chsuh@kaist.ac.kr;feixia@stanford.edu;dntse@stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfeizi2018understanding,\ntitle={Understanding {GAN}s: the {LQG} Setting},\nauthor={Soheil Feizi and Changho Suh and Fei Xia and David Tse},\nyear={2018},\nurl={https://openreview.net/forum?id=r1CE9GWR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1CE9GWR-", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17144026512307632263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1D4bs1Wz", "title": "Dense Transformer Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The key idea of current deep learning methods for dense prediction\nis to apply a model on a regular patch centered on each pixel to\nmake pixel-wise predictions. These methods are limited in the sense\nthat the patches are determined by network architecture instead of\nlearned from data. In this work, we propose the dense transformer\nnetworks, which can learn the shapes and sizes of patches from data.\nThe dense transformer networks employ an encoder-decoder\narchitecture, and a pair of dense transformer modules are inserted\ninto each of the encoder and decoder paths. The novelty of this work\nis that we provide technical solutions for learning the shapes and\nsizes of patches from data and efficiently restoring the spatial\ncorrespondence required for dense prediction. The proposed dense\ntransformer modules are differentiable, thus the entire network can\nbe trained. 
We apply the proposed networks on natural and biological\nimage segmentation tasks and show superior performance is achieved\nin comparison to baseline methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun Li;Yongjun Chen;Lei Cai;Ian Davidson;Shuiwang Ji", "authorids": "jun.li3@wsu.edu;yongjun.chen@wsu.edu;lei.cai@wsu.edu;davidson@cs.ucdavis.edu;sji@eecs.wsu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1D4bs1Wz", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1330525427380290803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1DPFCyA-", "title": "Discriminative k-shot learning using probabilistic models", "track": "main", "status": "Reject", "tldr": "This paper introduces a probabilistic framework for k-shot image classification that achieves state-of-the-art results", "abstract": "This paper introduces a probabilistic framework for k-shot image classification. The goal is to generalise from an initial large-scale classification task to a separate task comprising new classes and small numbers of examples. The new approach not only leverages the feature-based representation learned by a neural network from the initial task (representational transfer), but also information about the classes (concept transfer). The concept information is encapsulated in a probabilistic model for the final layer weights of the neural network which acts as a prior for probabilistic k-shot learning. We show that even a simple probabilistic model achieves state-of-the-art on a standard k-shot learning dataset by a large margin. Moreover, it is able to accurately model uncertainty, leading to well calibrated classifiers, and is easily extensible and flexible, unlike many recent approaches to k-shot learning.", "keywords": "discriminative k-shot learning;probabilistic inference", "primary_area": "", "supplementary_material": "", "author": "Matthias Bauer;Mateo Rojas-Carulla;Jakub Bart\u0142omiej \u015awi\u0105tkowski;Bernhard Sch\u00f6lkopf;Richard E. Turner", "authorids": "msb55@cam.ac.uk;mrojascarulla@gmail.com;kuba.swiatkowski@gmail.com;bs@tuebingen.mpg.de;ret26@cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbauer2018discriminative,\ntitle={Discriminative k-shot learning using probabilistic models},\nauthor={Matthias Bauer and Mateo Rojas-Carulla and Jakub Bart\u0142omiej \u015awi\u0105tkowski and Bernhard Sch\u00f6lkopf and Richard E. 
Turner},\nyear={2018},\nurl={https://openreview.net/forum?id=r1DPFCyA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1DPFCyA-", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;3;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12518975774456633060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "mixup: Beyond Empirical Risk Minimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/177", "id": "r1Ddp1-Rb", "author_site": "Hongyi Zhang, Moustapha Cisse, Yann N Dauphin, David Lopez-Paz", "tldr": "Training on convex combinations between random training examples and their labels improves generalization in deep neural networks", "abstract": "Large deep neural networks are powerful, but exhibit undesirable behaviors such as memorization and sensitivity to adversarial examples. In this work, we propose mixup, a simple learning principle to alleviate these issues. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularizes the neural network to favor simple linear behavior in-between training examples. Our experiments on the ImageNet-2012, CIFAR-10, CIFAR-100, Google commands and UCI datasets show that mixup improves the generalization of state-of-the-art neural network architectures. We also find that mixup reduces the memorization of corrupt labels, increases the robustness to adversarial examples, and stabilizes the training of generative adversarial networks.", "keywords": "empirical risk minimization;vicinal risk minimization;generalization;data augmentation;image classification;generative adversarial networks;adversarial examples;random labels", "primary_area": "", "supplementary_material": "", "author": "Hongyi Zhang;Moustapha Cisse;Yann N. Dauphin;David Lopez-Paz", "authorids": "hongyiz@mit.edu;moustaphacisse@fb.com;ynd@fb.com;dlp@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018mixup,\ntitle={mixup: Beyond Empirical Risk Minimization},\nauthor={Hongyi Zhang and Moustapha Cisse and Yann N. 
Dauphin and David Lopez-Paz},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1Ddp1-Rb},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/mixup-cifar10](https://github.com/facebookresearch/mixup-cifar10) + [![Papers with Code](/images/pwc_icon.svg) 70 community implementations](https://paperswithcode.com/paper/?openreview=r1Ddp1-Rb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 12891, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12669856454801555406&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=r1Ddp1-Rb", "pdf": "https://openreview.net/pdf?id=r1Ddp1-Rb", "email": ";;;", "author_num": 4 }, { "title": "Generalizing Across Domains via Cross-Gradient Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/83", "id": "r1Dx7fbCW", "author_site": "Shiv Shankar, Vihari Piratla, Soumen Chakrabarti, Siddhartha Chaudhuri, Preethi Jyothi, Sunita Sarawagi", "tldr": "Domain guided augmentation of data provides a robust and stable method of domain generalization", "abstract": "We present CROSSGRAD , a method to use multi-domain training data to learn a classifier that generalizes to new domains. CROSSGRAD does not need an adaptation phase via labeled or unlabeled data, or domain features in the new domain. Most existing domain adaptation methods attempt to erase domain signals using techniques like domain adversarial training. In contrast, CROSSGRAD is free to use domain signals for predicting labels, if it can prevent overfitting on training domains. We conceptualize the task in a Bayesian setting, in which a sampling step is implemented as data augmentation, based on domain-guided perturbations of input instances. CROSSGRAD jointly trains a label and a domain classifier on examples perturbed by loss gradients of each other\u2019s objectives. This enables us to directly perturb inputs, without separating and re-mixing domain signals while making various distributional assumptions. 
Empirical evaluation on three different applications where this setting is natural establishes that\n (1) domain-guided perturbation provides consistently better generalization to unseen domains, compared to generic instance perturbation methods, and \n(2) data augmentation is a more stable and accurate method than domain adversarial training.", "keywords": "domain generalization;domain adaptation;adversarial learning;adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Shiv Shankar*;Vihari Piratla*;Soumen Chakrabarti;Siddhartha Chaudhuri;Preethi Jyothi;Sunita Sarawagi", "authorids": "shivshankariitb@gmail.com;viharipiratla@gmail.com;soumen@cse.iitb.ac.in;sidch@cse.iitb.ac.in;pjyothi@cse.iitb.ac.in;sunita@iitb.ac.in", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nshankar2018generalizing,\ntitle={Generalizing Across Domains via Cross-Gradient Training},\nauthor={Shiv Shankar and Vihari Piratla and Soumen Chakrabarti and Siddhartha Chaudhuri and Preethi Jyothi and Sunita Sarawagi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1Dx7fbCW},\n}", "github": "[![github](/images/github_icon.svg) vihari/crossgrad](https://github.com/vihari/crossgrad)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7;8", "confidence": "4;5;5;4", "rating_avg": 7.25, "confidence_avg": 4.5, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 645, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4167124586655060881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1Dx7fbCW", "pdf": "https://openreview.net/pdf?id=r1Dx7fbCW", "email": ";;;;;", "author_num": 6 }, { "id": "r1HNP0eCW", "title": "Estimation of cross-lingual news similarities using text-mining methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Every second, innumerable text data, including all kinds news, reports, messages, reviews, comments, and twits have been generated on the Internet, which is written not only in English but also in other languages such as Chinese, Japanese, French and so on. Not only SNS sites but also worldwide news agency such as Thomson Reuters News provide news reported in more than 20 languages, reflecting the significance of the multilingual information.\nIn this research, by taking advantage of multi-lingual text resources provided by the Thomson Reuters News, we developed a bidirectional LSTM based method to calculate cross-lingual semantic text similarity for long text and short text respectively. 
Thus, users could understand the situation comprehensively, by investigating similar and related cross-lingual articles, when there an important news comes in.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhouhao Wang;Enda Liu;Hiroki Sakaji;Tomoki Ito;Kiyoshi Izumi;Kota Tsubouchi;Tatsuo Yamashita", "authorids": "wangzhouhao94@gmail.com;m2015eliu@socsim.org;sakaji@sys.t.u-tokyo.ac.jp;m2015titoh@socsim.org;izumi@sys.t.u-tokyo.ac.jp;ktsubouc@yahoo-corp.jp;tayamash@yahoo-corp.jp", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nwang2018estimation,\ntitle={Estimation of cross-lingual news similarities using text-mining methods},\nauthor={Zhouhao Wang and Enda Liu and Hiroki Sakaji and Tomoki Ito and Kiyoshi Izumi and Kota Tsubouchi and Tatsuo Yamashita},\nyear={2018},\nurl={https://openreview.net/forum?id=r1HNP0eCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1HNP0eCW", "pdf_size": 0, "rating": "2;2;6", "confidence": "5;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 7, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7564743270389117704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "title": "Learning Awareness Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/16", "id": "r1HhRfWRZ", "author_site": "Brandon Amos, Laurent Dinh, Serkan Cabi, Thomas Roth\u00f6rl, Sergio G\u00f3mez Colmenarejo, Alistair Muldal, Tom Erez, Yuval Tassa, Nando de Freitas, Misha Denil", "tldr": "We train predictive models on proprioceptive information and show they represent properties of external objects.", "abstract": "We consider the setting of an agent with a fixed body interacting with an unknown and uncertain external world. We show that models trained to predict proprioceptive information about the agent's body come to represent objects in the external world. In spite of being trained with only internally available signals, these dynamic body models come to represent external objects through the necessity of predicting their effects on the agent's own body. That is, the model learns holistic persistent representations of objects in the world, even though the only training signals are body signals. Our dynamics model is able to successfully predict distributions over 132 sensor readings over 100 steps into the future and we demonstrate that even when the body is no longer in contact with an object, the latent variables of the dynamics model continue to represent its shape. We show that active data collection by maximizing the entropy of predictions about the body---touch sensors, proprioception and vestibular information---leads to learning of dynamic models that show superior performance when used for control. We also collect data from a real robotic hand and show that the same models can be used to answer questions about properties of objects in the real world. 
Videos with qualitative results of our models are available at https://goo.gl/mZuqAV.", "keywords": "Awareness;Prediction;Seq2seq;Robots", "primary_area": "", "supplementary_material": "", "author": "Brandon Amos;Laurent Dinh;Serkan Cabi;Thomas Roth\u00f6rl;Sergio G\u00f3mez Colmenarejo;Alistair Muldal;Tom Erez;Yuval Tassa;Nando de Freitas;Misha Denil", "authorids": "bamos@cs.cmu.edu;dinh.laurent@gmail.com;cabi@google.com;tcr@google.com;sergomez@google.com;alimuldal@google.com;etom@google.com;tassa@google.com;nandodefreitas@google.com;mdenil@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\namos2018learning,\ntitle={Learning Awareness Models},\nauthor={Brandon Amos and Laurent Dinh and Serkan Cabi and Thomas Roth\u00f6rl and Alistair Muldal and Tom Erez and Yuval Tassa and Nando de Freitas and Misha Denil},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1HhRfWRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;4;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 10, "corr_rating_confidence": -0.5, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3184887201063924781&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1HhRfWRZ", "pdf": "https://openreview.net/pdf?id=r1HhRfWRZ", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "r1ISxGZRb", "title": "Generation and Consolidation of Recollections for Efficient Deep Lifelong Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep lifelong learning systems need to efficiently manage resources to scale to large numbers of experiences and non-stationary goals. In this paper, we explore the relationship between lossy compression and the resource constrained lifelong learning problem of function transferability. We demonstrate that lossy episodic experience storage can enable efficient function transferability between different architectures and algorithms at a fraction of the storage cost of lossless storage. This is achieved by introducing a generative knowledge distillation strategy that does not store any full training examples. As an important extension of this idea, we show that lossy recollections stabilize deep networks much better than lossless sampling in resource constrained settings of lifelong learning while avoiding catastrophic forgetting. For this setting, we propose a novel dual purpose recollection buffer used to both stabilize the recollection generator itself and an accompanying reasoning model. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matt Riemer;Michele Franceschini;and Tim Klinger", "authorids": "mdriemer@us.ibm.com;franceschini@us.ibm.com;tklinger@us.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nriemer2018generation,\ntitle={Generation and Consolidation of Recollections for Efficient Deep Lifelong Learning},\nauthor={Matt Riemer and Michele Franceschini and and Tim Klinger},\nyear={2018},\nurl={https://openreview.net/forum?id=r1ISxGZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=r1ISxGZRb", "pdf_size": 0, "rating": "5;5;5", "confidence": "2;3;3", "rating_avg": 5.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10602040730586137771&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1Kr3TyAb", "title": "ANALYSIS ON GRADIENT PROPAGATION IN BATCH NORMALIZED RESIDUAL NETWORKS", "track": "main", "status": "Reject", "tldr": "Batch normalisation maintains gradient variance throughout training, thus stabilizing optimization.", "abstract": "We conduct a mathematical analysis on the Batch normalization (BN) effect on gradient backpropagation in residual network training in this work, which is believed to play a critical role in addressing the gradient vanishing/explosion problem. Specifically, by analyzing the mean and variance behavior of the input and the gradient in the forward and backward passes through the BN and residual branches, respectively, we show that they work together to confine the gradient variance to a certain range across residual blocks in backpropagation. As a result, the gradient vanishing/explosion problem is avoided. Furthermore, we use the same analysis to discuss the tradeoff between depth and width of a residual network and demonstrate that shallower yet wider resnets have stronger learning performance than deeper yet thinner resnets.", "keywords": "Batch normalization;gradient backpropagation;Residual network;wide residual network", "primary_area": "", "supplementary_material": "", "author": "Abhishek Panigrahi;Yueru Chen;C.-C. Jay Kuo", "authorids": "abhishekpanigrahi@iitkgp.ac.in;yueruche@usc.edu;cckuo@sipi.usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npanigrahi2018analysis,\ntitle={{ANALYSIS} {ON} {GRADIENT} {PROPAGATION} {IN} {BATCH} {NORMALIZED} {RESIDUAL} {NETWORKS}},\nauthor={Abhishek Panigrahi and Yueru Chen and C.-C. 
Jay Kuo},\nyear={2018},\nurl={https://openreview.net/forum?id=r1Kr3TyAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Kr3TyAb", "pdf_size": 0, "rating": "1;4;4", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1676794445313339945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Generative networks as inverse problems with Scattering transforms", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/36", "id": "r1NYjfbR-", "author_site": "Tom\u00e1s Angles, St\u00e9phane Mallat", "tldr": "We introduce generative networks that do not require to be learned with a discriminator or an encoder; they are obtained by inverting a special embedding operator defined by a wavelet Scattering transform.", "abstract": "Generative Adversarial Nets (GANs) and Variational Auto-Encoders (VAEs) provide impressive image generations from Gaussian white noise, but the underlying mathematics are not well understood. We compute deep convolutional network generators by inverting a fixed embedding operator. Therefore, they do not require to be optimized with a discriminator or an encoder. The embedding is Lipschitz continuous to deformations so that generators transform linear interpolations between input white noise vectors into deformations between output images. This embedding is computed with a wavelet Scattering transform. Numerical experiments demonstrate that the resulting Scattering generators have similar properties as GANs or VAEs, without learning a discriminative network or an encoder.", "keywords": "Unsupervised Learning;Inverse Problems;Convolutional Networks;Generative Models;Scattering Transform", "primary_area": "", "supplementary_material": "", "author": "Tom\u00e1s Angles;St\u00e9phane Mallat", "authorids": "tomas.angles@ens.fr;stephane.mallat@ens.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nangles2018generative,\ntitle={Generative networks as inverse problems with scattering transforms},\nauthor={Tom\u00e1s Angles and St\u00e9phane Mallat},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1NYjfbR-},\n}", "github": "[![github](/images/github_icon.svg) tomas-angles/generative-scattering-networks](https://github.com/tomas-angles/generative-scattering-networks)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2488553421180641259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1NYjfbR-", "pdf": "https://openreview.net/pdf?id=r1NYjfbR-", "email": ";", "author_num": 2 }, { "id": "r1O0xvz-z", "title": "Deep Hyperspherical Defense against Adversarial Perturbations", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent studies show that deep neural networks are extremely vulnerable to adversarial examples which are semantically 
indistinguishable from natural data and yet incorrectly classified. These adversarial examples are generated from the natural data by adding a small amount of adversarial perturbation. This paper tackles the adversarial attack problem with hyperspherical defense - a defense strategy that learns neural network over hyperspheres. The hyperspherical defense framework is well motivated by: (i) Learning on hyperspheres gives us bounded output, which may make the geometry of neural networks more smooth; (ii) Learning on hyperspheres could naturally eliminate some adversarial perturbations and reduce the effect of adversarial perturbations; (iii) Representing data on hyperspheres selectively drops some information of the inputs, but these information are shown to be not crucial to visual recognition (based on the fact that hyperspherical neural network performs comparable to or even better than standard neural networks in visual recognition). Furthermore, we introduce the hyperspherical compactness and propose a robust geodesic inference. We also provide theoretical insights about why our hyperspherical defense improves robustness. Last, we validate the superiority of hyperspherical defense with extensive experiments on both white-box and black-box adversarial attacks on multiple datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiyang Liu;Zhen Liu;Zhehui Chen;Bo Dai;Tuo Zhao;Le Song", "authorids": "wyliu@gatech.edu;liuzhen1994@gatech.edu;zchen451@gatech.edu;bohr.dai@gmail.com;tourzhao@gatech.edu;lsong@cc.gatech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1O0xvz-z", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3177781369064645620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1Oen--RW", "title": "The (Un)reliability of saliency methods", "track": "main", "status": "Reject", "tldr": "Attribution can sometimes be misleading", "abstract": "Saliency methods aim to explain the predictions of deep neural networks. These methods lack reliability when the explanation is sensitive to factors that do not contribute to the model prediction. We use a simple and common pre-processing step ---adding a mean shift to the input data--- to show that a transformation with no effect on the model can cause numerous methods to incorrectly attribute. We define input invariance as the requirement that a saliency method mirror the sensitivity of the model with respect to transformations of the input. We show, through several examples, that saliency methods that do not satisfy a input invariance property are unreliable and can lead to misleading and inaccurate attribution.", "keywords": "Deep learning interpretability;understanding", "primary_area": "", "supplementary_material": "", "author": "Pieter-Jan Kindermans;Sara Hooker;Julius Adebayo;Kristof T. 
Sch\u00fctt;Maximilian Alber;Sven D\u00e4hne;Dumitru Erhan;Been Kim", "authorids": "pikinder@google.com;shooker@google.com;juliusad@google.com;kristof.schuett@tu-berlin.de;maximilian.aber@tu-berlin.de;sven.daehne@tu-berlin.de;dumitru@google.com;beenkim@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nkindermans2018the,\ntitle={The (Un)reliability of saliency methods},\nauthor={Pieter-Jan Kindermans and Sara Hooker and Julius Adebayo and Kristof T. Sch\u00fctt and Maximilian Alber and Sven D\u00e4hne and Dumitru Erhan and Been Kim},\nyear={2018},\nurl={https://openreview.net/forum?id=r1Oen--RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1Oen--RW", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 8, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 872, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14450824162613386137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14 }, { "id": "r1PaPUsXM", "title": "Deep Epitome for Unravelling Generalized Hamming Network: A Fuzzy Logic Interpretation of Deep Learning", "track": "main", "status": "Withdraw", "tldr": "bridge the gap in soft computing", "abstract": "This paper gives a rigorous analysis of trained Generalized Hamming Networks (GHN) proposed by Fan (2017) and discloses an interesting finding about GHNs, i.e. stacked convolution layers in a GHN is equivalent to a single yet wide convolution layer. The revealed equivalence, on the theoretical side, can be regarded as a constructive manifestation of the universal approximation theorem Cybenko (1989); Hornik (1991). In practice, it has profound and multi-fold implications. For network visualization, the constructed deep epitomes at each layer provide a visualization of network internal representation that does not rely on the input data. 
Moreover, deep epitomes allows the direct extraction of features in just one step, without resorting to regularized optimizations used in existing visualization tools.", "keywords": "deep learning;CNN;fuzzy logic;generalized hamming distance", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper167/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018deep,\n title={Deep Epitome for Unravelling Generalized Hamming Network: A Fuzzy Logic Interpretation of Deep Learning},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=rybEe2JAZ}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1PaPUsXM", "pdf_size": 0, "rating": "3;4;7", "confidence": "3;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": -0.7205766921228921, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17060753226505000925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1QZ3zbAZ", "title": "Adversarial Examples for Natural Language Classification Problems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern machine learning algorithms are often susceptible to adversarial examples \u2014 maliciously crafted inputs that are undetectable by humans but that fool the algorithm into producing undesirable behavior. In this work, we show that adversarial examples exist in natural language classification: we formalize the notion of an adversarial example in this setting and describe algorithms that construct such examples. Adversarial perturbations can be crafted for a wide range of tasks \u2014 including spam filtering, fake news detection, and sentiment analysis \u2014 and affect different models \u2014 convolutional and recurrent neural networks as well as linear classifiers to a lesser degree. Constructing an adversarial example involves replacing 10-30% of words in a sentence with synonyms that don\u2019t change its meaning. Up to 90% of input examples admit adversarial perturbations; furthermore, these perturbations retain a degree of transferability across models. 
Our findings demonstrate the existence of vulnerabilities in machine learning systems and hint at limitations in our understanding of classification algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Volodymyr Kuleshov;Shantanu Thakoor;Tingfung Lau;Stefano Ermon", "authorids": "vol.kuleshov@gmail.com;shanu.thakoor@gmail.com;ldf921@126.com;ermon@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkuleshov2018adversarial,\ntitle={Adversarial Examples for Natural Language Classification Problems},\nauthor={Volodymyr Kuleshov and Shantanu Thakoor and Tingfung Lau and Stefano Ermon},\nyear={2018},\nurl={https://openreview.net/forum?id=r1QZ3zbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1QZ3zbAZ", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12025933868326802065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1RF3ExCb", "title": "Transformation Autoregressive Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The fundamental task of general density estimation has been of keen interest to machine learning. Recent advances in density estimation have either: a) proposed using a flexible model to estimate the conditional factors of the chain rule; or b) used flexible, non-linear transformations of variables of a simple base distribution. Instead, this work jointly leverages transformations of variables and autoregressive conditional models, and proposes novel methods for both. We provide a deeper understanding of our models, showing a considerable improvement with our methods through a comprehensive study over both real world and synthetic data. Moreover, we illustrate the use of our models in outlier detection and image modeling task.", "keywords": "density estimation;autoregressive models;RNNs", "primary_area": "", "supplementary_material": "", "author": "Junier Oliva;Avinava Dubey;Barnab\u00e1s P\u00f3czos;Eric P. Xing;Jeff Schneider", "authorids": "joliva@cs.cmu.edu;akdubey@cs.cmu.edu;bapoczos@cs.cmu.edu;epxing@cs.cmu.edu;schneide@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\noliva2018transformation,\ntitle={Transformation Autoregressive Networks},\nauthor={Junier Oliva and Avinava Dubey and Barnab\u00e1s P\u00f3czos and Eric P. 
Xing and Jeff Schneider},\nyear={2018},\nurl={https://openreview.net/forum?id=r1RF3ExCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1RF3ExCb", "pdf_size": 0, "rating": "5;5;8", "confidence": "2;3;4", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13773040729706836244&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "r1RQdCg0W", "title": "MACH: Embarrassingly parallel $K$-class classification in $O(d\\log{K})$ memory and $O(K\\log{K} + d\\log{K})$ time, instead of $O(Kd)$", "track": "main", "status": "Reject", "tldr": "How to Training 100,000 classes on a single GPU", "abstract": "We present Merged-Averaged Classifiers via Hashing (MACH) for $K$-classification with large $K$. Compared to traditional one-vs-all classifiers that require $O(Kd)$ memory and inference cost, MACH only need $O(d\\log{K})$ memory while only requiring $O(K\\log{K} + d\\log{K})$ operation for inference. MACH is the first generic $K$-classification algorithm, with provably theoretical guarantees, which requires $O(\\log{K})$ memory without any assumption on the relationship between classes. MACH uses universal hashing to reduce classification with a large number of classes to few independent classification task with very small (constant) number of classes. We provide theoretical quantification of accuracy-memory tradeoff by showing the first connection between extreme classification and heavy hitters. With MACH we can train ODP dataset with 100,000 classes and 400,000 features on a single Titan X GPU (12GB), with the classification accuracy of 19.28\\%, which is the best-reported accuracy on this dataset. Before this work, the best performing baseline is a one-vs-all classifier that requires 40 billion parameters (320 GB model size) and achieves 9\\% accuracy. In contrast, MACH can achieve 9\\% accuracy with 480x reduction in the model size (of mere 0.6GB). 
With MACH, we also demonstrate complete training of fine-grained imagenet dataset (compressed size 104GB), with 21,000 classes, on a single GPU.", "keywords": "Extreme Classification;Large-scale learning;hashing;GPU;High Performance Computing", "primary_area": "", "supplementary_material": "", "author": "Qixuan Huang;Anshumali Shrivastava;Yiqiu Wang", "authorids": "qh5@rice.edu;anshumali@rice.edu;yiqiu.wang@rice.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhuang2018mach,\ntitle={{MACH}: Embarrassingly parallel $K$-class classification in $O(d\\log{K})$ memory and $O(K\\log{K} + d\\log{K})$ time, instead of $O(Kd)$},\nauthor={Qixuan Huang and Anshumali Shrivastava and Yiqiu Wang},\nyear={2018},\nurl={https://openreview.net/forum?id=r1RQdCg0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1RQdCg0W", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Deep Sensing: Active Sensing using Multi-directional Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/239", "id": "r1SnX5xCb", "author_site": "Jinsung Yoon, William R Zame, Mihaela v Schaar", "tldr": "", "abstract": "For every prediction we might wish to make, we must decide what to observe (what source of information) and when to observe it. Because making observations is costly, this decision must trade off the value of information against the cost of observation. Making observations (sensing) should be an active choice. To solve the problem of active sensing we develop a novel deep learning architecture: Deep Sensing. At training time, Deep Sensing learns how to issue predictions at various cost-performance points. To do this, it creates multiple representations at various performance levels associated with different measurement rates (costs). This requires learning how to estimate the value of real measurements vs. inferred measurements, which in turn requires learning how to infer missing (unobserved) measurements. To infer missing measurements, we develop a Multi-directional Recurrent Neural Network (M-RNN). An M-RNN differs from a bi-directional RNN in that it sequentially operates across streams in addition to within streams, and because the timing of inputs into the hidden layers is both lagged and advanced. At runtime, the operator prescribes a performance level or a cost constraint, and Deep Sensing determines what measurements to take and what to infer from those measurements, and then issues predictions. To demonstrate the power of our method, we apply it to two real-world medical datasets with significantly improved performance.", "keywords": "Active Sensing;Timely Prediction;Irregular Sampling;Missing Data", "primary_area": "", "supplementary_material": "", "author": "Jinsung Yoon;William R. 
Zame;Mihaela van der Schaar", "authorids": "jsyoon0823@gmail.com;zame@econ.ucla.edu;mihaela.vanderschaar@oxford-man.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyoon2018deep,\ntitle={Deep Sensing: Active Sensing using Multi-directional Recurrent Neural Networks},\nauthor={Jinsung Yoon and William R. Zame and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1SnX5xCb},\n}", "github": "[![github](/images/github_icon.svg) vanderschaarlab/mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/DeepSensing%20(MRNN))", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14422140547542510544&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=r1SnX5xCb", "pdf": "https://openreview.net/pdf?id=r1SnX5xCb", "email": ";;", "author_num": 3 }, { "id": "r1SuFjkRW", "title": "Discrete Sequential Prediction of Continuous Actions for Deep RL", "track": "main", "status": "Reject", "tldr": "A method to do Q-learning on continuous action spaces by predicting a sequence of discretized 1-D actions.", "abstract": "It has long been assumed that high dimensional continuous control problems cannot be solved effectively by discretizing individual dimensions of the action space due to the exponentially large number of bins over which policies would have to be learned. In this paper, we draw inspiration from the recent success of sequence-to-sequence models for structured prediction problems to develop policies over discretized spaces. Central to this method is the realization that complex functions over high dimensional spaces can be modeled by neural networks that predict one dimension at a time. Specifically, we show how Q-values and policies over continuous spaces can be modeled using a next step prediction model over discretized dimensions. With this parameterization, it is possible to both leverage the compositional structure of action spaces during learning, as well as compute maxima over action spaces (approximately). On a simple example task we demonstrate empirically that our method can perform global search, which effectively gets around the local optimization issues that plague DDPG. 
We apply the technique to off-policy (Q-learning) methods and show that our method can achieve the state-of-the-art for off-policy methods on several continuous control tasks.", "keywords": "Reinforcement learning;continuous control;deep learning", "primary_area": "", "supplementary_material": "", "author": "Luke Metz;Julian Ibarz;Navdeep Jaitly;James Davidson", "authorids": "lmetz@google.com;julianibarz@google.com;njaitly@google.com;jcdavidson@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmetz2018discrete,\ntitle={Discrete Sequential Prediction of Continuous Actions for Deep {RL}},\nauthor={Luke Metz and Julian Ibarz and Navdeep Jaitly and James Davidson},\nyear={2018},\nurl={https://openreview.net/forum?id=r1SuFjkRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1SuFjkRW", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;1;5", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 156, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13277754803307475907&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1TA9ZbA-", "title": "Learning to search with MCTSnets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Planning problems are among the most important and well-studied problems in artificial intelligence. They are most typically solved by tree search algorithms that simulate ahead into the future, evaluate future states, and back-up those evaluations to the root of a search tree. Among these algorithms, Monte-Carlo tree search (MCTS) is one of the most general, powerful and widely used. A typical implementation of MCTS uses cleverly designed rules, optimised to the particular characteristics of the domain. These rules control where the simulation traverses, what to evaluate in the states that are reached, and how to back-up those evaluations. In this paper we instead learn where, what and how to search. Our architecture, which we call an MCTSnet, incorporates simulation-based search inside a neural network, by expanding, evaluating and backing-up a vector embedding. The parameters of the network are trained end-to-end using gradient-based optimisation. When applied to small searches in the well-known planning problem Sokoban, the learned search algorithm significantly outperformed MCTS baselines. 
", "keywords": "Monte-Carlo Tree Search;search;planning", "primary_area": "", "supplementary_material": "", "author": "Arthur Guez;Theophane Weber;Ioannis Antonoglou;Karen Simonyan;Oriol Vinyals;Daan Wierstra;Remi Munos;David Silver", "authorids": "aguez@google.com;theophane@google.com;ioannisa@google.com;simonyan@google.com;vinyals@google.com;wierstra@google.com;munos@google.com;davidsilver@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nguez2018learning,\ntitle={Learning to search with {MCTS}nets},\nauthor={Arthur Guez and Theophane Weber and Ioannis Antonoglou and Karen Simonyan and Oriol Vinyals and Daan Wierstra and Remi Munos and David Silver},\nyear={2018},\nurl={https://openreview.net/forum?id=r1TA9ZbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1TA9ZbA-", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 8, "corr_rating_confidence": -0.944911182523068, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2965404279553213316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Synthesizing realistic neural population activity patterns using Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/143", "id": "r1VVsebAZ", "author_site": "Manuel Molano-Mazon, Arno Onken, Eugenio Piasini, Stefano Panzeri", "tldr": "Using Wasserstein-GANs to generate realistic neural activity and to detect the most relevant features present in neural population patterns.", "abstract": "The ability to synthesize realistic patterns of neural activity is crucial for studying neural information processing. Here we used the Generative Adversarial Networks (GANs) framework to simulate the concerted activity of a population of neurons.\nWe adapted the Wasserstein-GAN variant to facilitate the generation of unconstrained neural population activity patterns while still benefiting from parameter sharing in the temporal domain.\nWe demonstrate that our proposed GAN, which we termed Spike-GAN, generates spike trains that match accurately the first- and second-order statistics of datasets of tens of neurons and also approximates well their higher-order statistics. We applied Spike-GAN to a real dataset recorded from salamander retina and showed that it performs as well as state-of-the-art approaches based on the maximum entropy and the dichotomized Gaussian frameworks. Importantly, Spike-GAN does not require to specify a priori the statistics to be matched by the model, and so constitutes a more flexible method than these alternative approaches.\nFinally, we show how to exploit a trained Spike-GAN to construct 'importance maps' to detect the most relevant statistical structures present in a spike train. 
\nSpike-GAN provides a powerful, easy-to-use technique for generating realistic spiking neural activity and for describing the most relevant features of the large-scale neural population recordings studied in modern systems neuroscience.\n", "keywords": "GANs;Wasserstein-GANs;convolutional networks;neuroscience;spike train patterns;spike train analysis", "primary_area": "", "supplementary_material": "", "author": "Manuel Molano-Mazon;Arno Onken;Eugenio Piasini*;Stefano Panzeri*", "authorids": "manuel.molano@iit.it;aonken@inf.ed.ac.uk;epiasini@sas.upenn.edu;stefano.panzeri@iit.it", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmolano-mazon2018synthesizing,\ntitle={Synthesizing realistic neural population activity patterns using Generative Adversarial Networks},\nauthor={Manuel Molano-Mazon and Arno Onken and Eugenio Piasini* and Stefano Panzeri*},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1VVsebAZ},\n}", "github": "[![github](/images/github_icon.svg) manuelmolano/Spike-GAN](https://github.com/manuelmolano/Spike-GAN)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;3;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3292717005509087968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=r1VVsebAZ", "pdf": "https://openreview.net/pdf?id=r1VVsebAZ", "email": ";;;", "author_num": 4 }, { "id": "r1YUtYx0-", "title": "Ensemble Robustness and Generalization of Stochastic Deep Learning Algorithms", "track": "main", "status": "Workshop", "tldr": "Explaining the generalization of stochastic deep learning algorithms, theoretically and empirically, via ensemble robustness", "abstract": "The question why deep learning algorithms generalize so well has attracted increasing\nresearch interest. However, most of the well-established approaches,\nsuch as hypothesis capacity, stability or sparseness, have not provided complete\nexplanations (Zhang et al., 2016; Kawaguchi et al., 2017). In this work, we focus\non the robustness approach (Xu & Mannor, 2012), i.e., if the error of a hypothesis\nwill not change much due to perturbations of its training examples, then it\nwill also generalize well. As most deep learning algorithms are stochastic (e.g.,\nStochastic Gradient Descent, Dropout, and Bayes-by-backprop), we revisit the robustness\narguments of Xu & Mannor, and introduce a new approach \u2013 ensemble\nrobustness \u2013 that concerns the robustness of a population of hypotheses. Through\nthe lens of ensemble robustness, we reveal that a stochastic learning algorithm can\ngeneralize well as long as its sensitiveness to adversarial perturbations is bounded\nin average over training examples. Moreover, an algorithm may be sensitive to\nsome adversarial examples (Goodfellow et al., 2015) but still generalize well. 
To\nsupport our claims, we provide extensive simulations for different deep learning\nalgorithms and different network architectures exhibiting a strong correlation between\nensemble robustness and the ability to generalize.", "keywords": "Robustness;Generalization;Deep Learning;Adversarial Learning", "primary_area": "", "supplementary_material": "", "author": "Tom Zahavy;Bingyi Kang;Alex Sivak;Jiashi Feng;Huan Xu;Shie Mannor", "authorids": "tomzahavy@gmail.com;bingykang@gmail.com;silex@campus.technion.ac.il;jshfeng@gmail.com;huan.xu@isye.gatech.edu;shiemannor@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzahavy2018ensemble,\ntitle={Ensemble Robustness and Generalization of Stochastic Deep Learning Algorithms},\nauthor={Tom Zahavy and Bingyi Kang and Alex Sivak and Jiashi Feng and Huan Xu and Shie Mannor},\nyear={2018},\nurl={https://openreview.net/forum?id=r1YUtYx0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1YUtYx0-", "pdf_size": 0, "rating": "4;4;8", "confidence": "5;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5276177564991433700&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1YqWz-R-", "title": "Improving Conditional Sequence Generative Adversarial Networks by Stepwise Evaluation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Conditional sequence generation is a widely researched topic. One of the most important tasks is dialogue generation, which is composed of input-output pairs with the one-to-many property. Given the recent success of generative adversarial networks (GANs), GANs have been used for sequence generation. However, there is still limited work of its application on conditional sequence generation. We investigate the influence of GAN on conditional sequence generation with three artificial grammars and dialogue generation. Moreover, we propose stepwise GAN (StepGAN) for conditional sequence generation, which predicts the reward at each time-step. StepGAN can be seen as the general version of SeqGAN. It estimates the expected returns predicted by Monte-Carlo Search in SeqGAN, but it has a lower computational cost than Monte-Carlo Search. 
Experimental results show that stepwise GAN can outperform other state-of-the-art algorithms in most tasks.", "keywords": "conditional sequence generation;generative adversarial network;REINFORCE;dialogue generation", "primary_area": "", "supplementary_material": "", "author": "Yi-Lin Tuan;Hung-yi Lee", "authorids": "pascaltuan@gmail.com;hungyilee@ntu.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntuan2018improving,\ntitle={Improving Conditional Sequence Generative Adversarial Networks by Stepwise Evaluation},\nauthor={Yi-Lin Tuan and Hung-yi Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=r1YqWz-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1YqWz-R-", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10890526729663013155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/188", "id": "r1ZdKJ-0W", "author_site": "Aleksandar Bojchevski, Stephan G\u00fcnnemann", "tldr": " We embed nodes in a graph as Gaussian distributions allowing us to capture uncertainty about their representation.", "abstract": "Methods that learn representations of nodes in a graph play a critical role in network analysis since they enable many downstream learning tasks. We propose Graph2Gauss - an approach that can efficiently learn versatile node embeddings on large scale (attributed) graphs that show strong performance on tasks such as link prediction and node classification. Unlike most approaches that represent nodes as point vectors in a low-dimensional continuous space, we embed each node as a Gaussian distribution, allowing us to capture uncertainty about the representation. Furthermore, we propose an unsupervised method that handles inductive learning scenarios and is applicable to different types of graphs: plain/attributed, directed/undirected. By leveraging both the network structure and the associated node attributes, we are able to generalize to unseen nodes without additional training. To learn the embeddings we adopt a personalized ranking formulation w.r.t. the node distances that exploits the natural ordering of the nodes imposed by the network structure. Experiments on real world networks demonstrate the high performance of our approach, outperforming state-of-the-art network embedding methods on several different tasks. Additionally, we demonstrate the benefits of modeling uncertainty - by analyzing it we can estimate neighborhood diversity and detect the intrinsic latent dimensionality of a graph. 
", "keywords": "node embeddings;graphs;unsupervised learning;inductive learning;uncertainty;deep learning", "primary_area": "", "supplementary_material": "", "author": "Aleksandar Bojchevski;Stephan G\u00fcnnemann", "authorids": "a.bojchevski@in.tum.de;guennemann@in.tum.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbojchevski2018deep,\ntitle={Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking},\nauthor={Aleksandar Bojchevski and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1ZdKJ-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 885, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10006383742972013439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1ZdKJ-0W", "pdf": "https://openreview.net/pdf?id=r1ZdKJ-0W", "email": ";", "author_num": 2 }, { "id": "r1Zi2Mb0-", "title": "EXPLORING NEURAL ARCHITECTURE SEARCH FOR LANGUAGE TASKS", "track": "main", "status": "Reject", "tldr": "We explore neural architecture search for language tasks. Recurrent cell search is challenging for NMT, but attention mechanism search works. The result of attention search on translation is transferable to reading comprehension.", "abstract": "Neural architecture search (NAS), the task of finding neural architectures automatically, has recently emerged as a promising approach for unveiling better models over human-designed ones. However, most success stories are for vision tasks and have been quite limited for text, except for a small language modeling setup. In this paper, we explore NAS for text sequences at scale, by first focusing on the task of language translation and later extending to reading comprehension. From a standard sequence-to-sequence models for translation, we conduct extensive searches over the recurrent cells and attention similarity functions across two translation tasks, IWSLT English-Vietnamese and WMT German-English. We report challenges in performing cell searches as well as demonstrate initial success on attention searches with translation improvements over strong baselines. In addition, we show that results on attention searches are transferable to reading comprehension on the SQuAD dataset.", "keywords": "Neural architecture search;language tasks;neural machine translation;reading comprehension;SQuAD", "primary_area": "", "supplementary_material": "", "author": "Minh-Thang Luong;David Dohan;Adams Wei Yu;Quoc V. Le;Barret Zoph;Vijay Vasudevan", "authorids": "luong.m.thang@gmail.com;ddohan@google.com;adamsyuwei@gmail.com;qvl@google.com;barretzoph@google.com;vrv@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nluong2018exploring,\ntitle={{EXPLORING} {NEURAL} {ARCHITECTURE} {SEARCH} {FOR} {LANGUAGE} {TASKS}},\nauthor={Minh-Thang Luong and David Dohan and Adams Wei Yu and Quoc V. 
Le and Barret Zoph and Vijay Vasudevan},\nyear={2018},\nurl={https://openreview.net/forum?id=r1Zi2Mb0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Zi2Mb0-", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2856603725574680280&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1ayG7WRZ", "title": "Don't encrypt the data; just approximate the model \\ Towards Secure Transaction and Fair Pricing of Training Data", "track": "main", "status": "Reject", "tldr": "Facing complex, black-box models, encrypting the data is not as usable as approximating the model and using it to price a potential transaction.", "abstract": "As machine learning becomes ubiquitous, deployed systems need to be as accu- rate as they can. As a result, machine learning service providers have a surging need for useful, additional training data that benefits training, without giving up all the details about the trained program. At the same time, data owners would like to trade their data for its value, without having to first give away the data itself be- fore receiving compensation. It is difficult for data providers and model providers to agree on a fair price without first revealing the data or the trained model to the other side. Escrow systems only complicate this further, adding an additional layer of trust required of both parties. Currently, data owners and model owners don\u2019t have a fair pricing system that eliminates the need to trust a third party and training the model on the data, which 1) takes a long time to complete, 2) does not guarantee that useful data is paid valuably and that useless data isn\u2019t, without trusting in the third party with both the model and the data. Existing improve- ments to secure the transaction focus heavily on encrypting or approximating the data, such as training on encrypted data, and variants of federated learning. As powerful as the methods appear to be, we show them to be impractical in our use case with real world assumptions for preserving privacy for the data owners when facing black-box models. Thus, a fair pricing scheme that does not rely on secure data encryption and obfuscation is needed before the exchange of data. This pa- per proposes a novel method for fair pricing using data-model efficacy techniques such as influence functions, model extraction, and model compression methods, thus enabling secure data transactions. We successfully show that without running the data through the model, one can approximate the value of the data; that is, if the data turns out redundant, the pricing is minimal, and if the data leads to proper improvement, its value is properly assessed, without placing strong assumptions on the nature of the model. 
Future work will be focused on establishing a system with stronger transactional security against adversarial attacks that will reveal details about the model or the data to the other party.", "keywords": "Applications;Security in Machine Learning;Fairness and Security;Model Compression", "primary_area": "", "supplementary_material": "", "author": "Xinlei Xu", "authorids": "xxu@hmc.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nxu2018dont,\ntitle={Don't encrypt the data; just approximate the model \\ Towards Secure Transaction and Fair Pricing of Training Data},\nauthor={Xinlei Xu},\nyear={2018},\nurl={https://openreview.net/forum?id=r1ayG7WRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1ayG7WRZ", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FV1FERANNYYJ:scholar.google.com/&scioq=Don%27t+encrypt+the+data%3B+just+approximate+the+model+%5C+Towards+Secure+Transaction+and+Fair+Pricing+of+Training+Data&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1cLblgCZ", "title": "Recurrent Auto-Encoder Model for Multidimensional Time Series Representation", "track": "main", "status": "Reject", "tldr": "Using recurrent auto-encoder model to extract multidimensional time series features", "abstract": "Recurrent auto-encoder model can summarise sequential data through an encoder structure into a fixed-length vector and then reconstruct into its original sequential form through the decoder structure. The summarised information can be used to represent time series features. In this paper, we propose relaxing the dimensionality of the decoder output so that it performs partial reconstruction. The fixed-length vector can therefore represent features only in the selected dimensions. In addition, we propose using rolling fixed window approach to generate samples. The change of time series features over time can be summarised as a smooth trajectory path. The fixed-length vectors are further analysed through additional visualisation and unsupervised clustering techniques. 
\n\nThis proposed method can be applied in large-scale industrial processes for sensor signal analysis purposes, where clusters of the vector representations can be used to reflect the operating states of selected aspects of the industrial system.", "keywords": "recurrent autoencoder;seq2seq;rnn;multidimensional time series;clustering;sensor;signal analysis;industrial application", "primary_area": "", "supplementary_material": "", "author": "Timothy Wong;Zhiyuan Luo", "authorids": "timothy.wong@centrica.com;zhiyuan.luo@cs.rhul.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwong2018recurrent,\ntitle={Recurrent Auto-Encoder Model for Multidimensional Time Series Representation},\nauthor={Timothy Wong and Zhiyuan Luo},\nyear={2018},\nurl={https://openreview.net/forum?id=r1cLblgCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1cLblgCZ", "pdf_size": 0, "rating": "2;2;4", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3702333096809751619&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Natural Language Inference over Interaction Space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/334", "id": "r1dHXnH6-", "author_site": "Yichen Gong, Heng Luo, Jian Zhang", "tldr": "show multi-channel attention weight contains semantic feature to solve natural language inference task.", "abstract": "The Natural Language Inference (NLI) task requires an agent to determine the logical relationship between a natural language premise and a natural language hypothesis. We introduce the Interactive Inference Network (IIN), a novel class of neural network architectures that is able to achieve high-level understanding of the sentence pair by hierarchically extracting semantic features from interaction space. We show that an interaction tensor (attention weight) contains semantic information to solve natural language inference, and a denser interaction tensor contains richer semantic information. One instance of such an architecture, the Densely Interactive Inference Network (DIIN), demonstrates state-of-the-art performance on large-scale NLI corpora and a large-scale NLI-like corpus. 
It is noteworthy that DIIN achieves a greater than 20% error reduction on the challenging Multi-Genre NLI (MultiNLI) dataset with respect to the strongest published system.", "keywords": "natural language inference;attention;SoTA;natural language understanding", "primary_area": "", "supplementary_material": "", "author": "Yichen Gong;Heng Luo;Jian Zhang", "authorids": "yichen.gong@nyu.edu;heng.luo@hobot.cc;jian.zhang@hobot.cc", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngong2018natural,\ntitle={Natural Language Inference over Interaction Space},\nauthor={Yichen Gong and Heng Luo and Jian Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1dHXnH6-},\n}", "github": "[![github](/images/github_icon.svg) YichenGong/Densely-Interactive-Inference-Network](https://github.com/YichenGong/Densely-Interactive-Inference-Network) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1dHXnH6-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 347, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3763530184208671433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1dHXnH6-", "pdf": "https://openreview.net/pdf?id=r1dHXnH6-", "email": ";;", "author_num": 3 }, { "id": "r1drp-WCZ", "title": "State Space LSTM Models with Particle MCMC Inference", "track": "main", "status": "Reject", "tldr": "We present State Space LSTM models, a combination of state space models and LSTMs, and propose an inference algorithm based on sequential Monte Carlo. ", "abstract": "Long Short-Term Memory (LSTM) is one of the most powerful sequence models. Despite the strong performance, however, it lacks the interpretability of state space models. In this paper, we present a way to combine the best of both worlds by introducing State Space LSTM (SSL), which generalizes the earlier work \cite{zaheer2017latent} of combining topic models with LSTM. However, unlike \cite{zaheer2017latent}, we do not make any factorization assumptions in our inference algorithm. We present an efficient sampler based on the sequential Monte Carlo (SMC) method that draws from the joint posterior directly. Experimental results confirm the superiority and stability of this SMC inference algorithm on a variety of domains.
Xing and Alex Smola},\nyear={2018},\nurl={https://openreview.net/forum?id=r1drp-WCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1drp-WCZ", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;5;5", "rating_avg": 5.0, "confidence_avg": 5.0, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4674994109106438958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Mixed Precision Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/288", "id": "r1gs9JgRZ", "author_site": "Paulius Micikevicius, SHARAN NARANG, Jonah Alben, Gregory Diamos, Erich K Elsen, David Garcia, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, Hao Wu", "tldr": "", "abstract": "Increasing the size of a neural network typically improves accuracy but also increases the memory and compute requirements for training the model. We introduce methodology for training deep neural networks using half-precision floating point numbers, without losing model accuracy or having to modify hyper-parameters. This nearly halves memory requirements and, on recent GPUs, speeds up arithmetic. Weights, activations, and gradients are stored in IEEE half-precision format. Since this format has a narrower range than single-precision we propose three techniques for preventing the loss of critical information. Firstly, we recommend maintaining a single-precision copy of weights that accumulates the gradients after each optimizer step (this copy is rounded to half-precision for the forward- and back-propagation). Secondly, we propose loss-scaling to preserve gradient values with small magnitudes. Thirdly, we use half-precision arithmetic that accumulates into single-precision outputs, which are converted to half-precision before storing to memory. 
We demonstrate that the proposed methodology works across a wide variety of tasks and modern large scale (exceeding 100 million parameters) model architectures, trained on large datasets.", "keywords": "Half precision;float16;Convolutional neural networks;Recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "Paulius Micikevicius;Sharan Narang;Jonah Alben;Gregory Diamos;Erich Elsen;David Garcia;Boris Ginsburg;Michael Houston;Oleksii Kuchaiev;Ganesh Venkatesh;Hao Wu", "authorids": "pauliusm@nvidia.com;sharan@baidu.com;alben@nvidia.com;gdiamos@baidu.com;eriche@google.com;dagarcia@nvidia.com;bginsburg@nvidia.com;mhouston@nvidia.com;okuchaiev@nvidia.com;gavenkatesh@nvidia.com;skyw@nvidia.com", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@inproceedings{\nmicikevicius2018mixed,\ntitle={Mixed Precision Training},\nauthor={Paulius Micikevicius and Sharan Narang and Jonah Alben and Gregory Diamos and Erich Elsen and David Garcia and Boris Ginsburg and Michael Houston and Oleksii Kuchaiev and Ganesh Venkatesh and Hao Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1gs9JgRZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=r1gs9JgRZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 11, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 2212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18172749567892275222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=r1gs9JgRZ", "pdf": "https://openreview.net/pdf?id=r1gs9JgRZ", "email": ";;;;;;;;;;", "author_num": 11 }, { "id": "r1h2DllAW", "title": "Discrete-Valued Neural Networks Using Variational Inference", "track": "main", "status": "Reject", "tldr": "Variational Inference for infering a discrete distribution from which a low-precision neural network is derived", "abstract": "The increasing demand for neural networks (NNs) being employed on embedded devices has led to plenty of research investigating methods for training low precision NNs. While most methods involve a quantization step, we propose a principled Bayesian approach where we first infer a distribution over a discrete weight space from which we subsequently derive hardware-friendly low precision NNs. To this end, we introduce a probabilistic forward pass to approximate the intractable variational objective that allows us to optimize over discrete-valued weight distributions for NNs with sign activation functions. In our experiments, we show that our model achieves state of the art performance on several real world data sets. 
In addition, the resulting models exhibit a substantial amount of sparsity that can be utilized to further reduce the computational costs for inference.", "keywords": "low-precision;neural networks;resource efficient;variational inference;Bayesian", "primary_area": "", "supplementary_material": "", "author": "Wolfgang Roth;Franz Pernkopf", "authorids": "roth@tugraz.at;pernkopf@tugraz.at", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nroth2018discretevalued,\ntitle={Discrete-Valued Neural Networks Using Variational Inference},\nauthor={Wolfgang Roth and Franz Pernkopf},\nyear={2018},\nurl={https://openreview.net/forum?id=r1h2DllAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1h2DllAW", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;1", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13210408398709308872&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1hsJCe0Z", "title": "Semantic Code Repair using Neuro-Symbolic Transformation Networks", "track": "main", "status": "Workshop", "tldr": "A neural architecture for scoring and ranking program repair candidates to perform semantic program repair statically without access to unit tests.", "abstract": "We study the problem of semantic code repair, which can be broadly defined as automatically fixing non-syntactic bugs in source code. The majority of past work in semantic code repair assumed access to unit tests against which candidate repairs could be validated. In contrast, the goal here is to develop a strong statistical model to accurately predict both bug locations and exact fixes without access to information about the intended correct behavior of the program. Achieving such a goal requires a robust contextual repair model, which we train on a large corpus of real-world source code that has been augmented with synthetically injected bugs. Our framework adopts a two-stage approach where first a large set of repair candidates are generated by rule-based processors, and then these candidates are scored by a statistical model using a novel neural network architecture which we refer to as Share, Specialize, and Compete. Specifically, the architecture (1) generates a shared encoding of the source code using an RNN over the abstract syntax tree, (2) scores each candidate repair using specialized network modules, and (3) then normalizes these scores together so they can compete against one another in comparable probability space. We evaluate our model on a real-world test set gathered from GitHub containing four common categories of bugs. 
Our model is able to predict the exact correct repair 41% of the time with a single guess, compared to 13% accuracy for an attentional sequence-to-sequence model.", "keywords": "semantic program repair;neural program embeddings;deep learning", "primary_area": "", "supplementary_material": "", "author": "Jacob Devlin;Jonathan Uesato;Rishabh Singh;Pushmeet Kohli", "authorids": "jacobdevlin@google.com;juesato@gmail.com;risin@microsoft.com;pushmeet@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndevlin2018semantic,\ntitle={Semantic Code Repair using Neuro-Symbolic Transformation Networks},\nauthor={Jacob Devlin and Jonathan Uesato and Rishabh Singh and Pushmeet Kohli},\nyear={2018},\nurl={https://openreview.net/forum?id=r1hsJCe0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1hsJCe0Z", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14522043052219246742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "On the importance of single directions for generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/232", "id": "r1iuQjxCZ", "author_site": "Ari Morcos, David Barrett, Neil C Rabinowitz, Matthew Botvinick", "tldr": "We find that deep networks which generalize poorly are more reliant on single directions than those that generalize well, and evaluate the impact of dropout and batch normalization, as well as class selectivity on single direction reliance.", "abstract": "Despite their ability to memorize large datasets, deep neural networks often achieve good generalization performance. However, the differences between the learned solutions of networks which generalize and those which do not remain unclear. Additionally, the tuning properties of single directions (defined as the activation of a single unit or some linear combination of units in response to some input) have been highlighted, but their importance has not been evaluated. Here, we connect these lines of inquiry to demonstrate that a network\u2019s reliance on single directions is a good predictor of its generalization performance, across networks trained on datasets with different fractions of corrupted labels, across ensembles of networks trained on datasets with unmodified labels, across different hyper- parameters, and over the course of training. While dropout only regularizes this quantity up to a point, batch normalization implicitly discourages single direction reliance, in part by decreasing the class selectivity of individual units. Finally, we find that class selectivity is a poor predictor of task importance, suggesting not only that networks which generalize well minimize their dependence on individual units by reducing their selectivity, but also that individually selective units may not be necessary for strong network performance.", "keywords": "generalization;analysis;deep learning;selectivity", "primary_area": "", "supplementary_material": "", "author": "Ari S. Morcos;David G.T. Barrett;Neil C. 
Rabinowitz;Matthew Botvinick", "authorids": "arimorcos@google.com;barrettdavid@google.com;ncr@google.com;botvinick@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ns.2018on,\ntitle={On the importance of single directions for generalization},\nauthor={Ari S. Morcos and David G.T. Barrett and Neil C. Rabinowitz and Matthew Botvinick},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1iuQjxCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 388, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3723864942652776777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=r1iuQjxCZ", "pdf": "https://openreview.net/pdf?id=r1iuQjxCZ", "email": ";;;", "author_num": 4 }, { "id": "r1kNDlbCb", "title": "Learning to Encode Text as Human-Readable Summaries using Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Auto-encoders compress input data into a latent-space representation and reconstruct the original data from the representation. This latent representation is not easily interpreted by humans. In this paper, we propose training an auto-encoder that encodes input text into human-readable sentences. The auto-encoder is composed of a generator and a reconstructor. The generator encodes the input text into a shorter word sequence, and the reconstructor recovers the generator input from the generator output.\nTo make the generator output human-readable, a discriminator restricts the output of the generator to resemble human-written sentences. By taking the generator output as the summary of the input text, abstractive summarization is achieved without document-summary pairs as training data. 
Promising results are shown on both English and Chinese corpora.", "keywords": "unsupervised learning;text summarization;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Yau-Shian Wang;Hung-Yi Lee", "authorids": "king6101@gmail.com;tlkagkb93901106@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwang2018learning,\ntitle={Learning to Encode Text as Human-Readable Summaries using Generative Adversarial Networks},\nauthor={Yau-Shian Wang and Hung-Yi Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=r1kNDlbCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1kNDlbCb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12720315531190663355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "r1kP7vlRb", "title": "Toward learning better metrics for sequence generation training with policy gradient", "track": "main", "status": "Reject", "tldr": "This paper aims to learn a better metric for unsupervised learning, such as text generation, and shows a significant improvement over SeqGAN.", "abstract": "Designing a metric manually for unsupervised sequence generation tasks, such as text generation, is essentially difficult. In such a situation, learning a metric of a sequence from data is one possible solution. The previous study, SeqGAN, proposed the framework for unsupervised sequence generation, in which a metric is learned from data, and a generator is optimized with regard to the learned metric with policy gradient, inspired by generative adversarial nets (GANs) and reinforcement learning. In this paper, we make two proposals to learn a better metric than SeqGAN's: partial reward function and expert-based reward function training. The partial reward function is a reward function for a partial sequence of a certain length. SeqGAN employs a reward function for completed sequences only. By combining long-scale and short-scale partial reward functions, we expect a learned metric to be able to evaluate partial correctness as well as the coherence of a sequence as a whole. In expert-based reward function training, a reward function is trained to discriminate between an expert (or true) sequence and a fake sequence that is produced by editing an expert sequence. Expert-based reward function training is not a kind of GAN framework. This makes the optimization of the generator easier. We examine the effect of the partial reward function and expert-based reward function training on synthetic data and real text data, and show improvements over SeqGAN and the model trained with MLE. 
Specifically, whereas SeqGAN gains a 0.42 improvement in NLL over MLE on synthetic data, our best model gains a 3.02 improvement, and whereas SeqGAN gains a 0.029 improvement in BLEU over MLE, our best model gains a 0.250 improvement.", "keywords": "sequence generation;reinforcement learning;unsupervised learning;RNN", "primary_area": "", "supplementary_material": "", "author": "Joji Toyama;Yusuke Iwasawa;Kotaro Nakayama;Yutaka Matsuo", "authorids": "toyama@weblab.t.u-tokyo.ac.jp;iwasawa@weblab.t.u-tokyo.ac.jp;nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntoyama2018toward,\ntitle={Toward learning better metrics for sequence generation training with policy gradient},\nauthor={Joji Toyama and Yusuke Iwasawa and Kotaro Nakayama and Yutaka Matsuo},\nyear={2018},\nurl={https://openreview.net/forum?id=r1kP7vlRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1kP7vlRb", "pdf_size": 0, "rating": "4;7;7", "confidence": "3;1;3", "rating_avg": 6.0, "confidence_avg": 2.3333333333333335, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.5000000000000001, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12024524226931870179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1kj4ACp-", "title": "Understanding Deep Learning Generalization by Maximum Entropy", "track": "main", "status": "Reject", "tldr": "We prove that DNN is a recursively approximated solution to the maximum entropy principle.", "abstract": "Deep learning achieves remarkable generalization capability with an overwhelming number of model parameters. Theoretical understanding of deep learning generalization has received recent attention yet remains not fully explored. This paper attempts to provide an alternative understanding from the perspective of maximum entropy. We first derive two feature conditions under which softmax regression strictly applies the maximum entropy principle. A DNN is then regarded as approximating the feature conditions with multilayer feature learning, and is proved to be a recursive solution towards the maximum entropy principle. 
The connection between DNNs and maximum entropy explains why typical designs such as shortcut connections and regularization improve model generalization, and it provides guidance for future model development.", "keywords": "generalization;maximum entropy;deep learning", "primary_area": "", "supplementary_material": "", "author": "Guanhua Zheng;Jitao Sang;Changsheng Xu", "authorids": "zhenggh@mail.ustc.edu.cn;jtsang@bjtu.edu.cn;csxu@nlpr.ia.ac.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzheng2018understanding,\ntitle={Understanding Deep Learning Generalization by Maximum Entropy},\nauthor={Guanhua Zheng and Jitao Sang and Changsheng Xu},\nyear={2018},\nurl={https://openreview.net/forum?id=r1kj4ACp-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1kj4ACp-", "pdf_size": 0, "rating": "2;3;6", "confidence": "3;3;2", "rating_avg": 3.6666666666666665, "confidence_avg": 2.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.9707253433941508, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=981895521378912368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1kjEuHpZ", "title": "Learning Less-Overlapping Representations", "track": "main", "status": "Reject", "tldr": "We propose a new type of regularization approach that encourages non-overlapness in representation learning, for the sake of improving interpretability and reducing overfitting.", "abstract": "In representation learning (RL), how to make the learned representations easy to interpret and less overfitted to training data are two important but challenging issues. To address these problems, we study a new type of regularization approach that encourages the supports of weight vectors in RL models to have small overlap, by simultaneously promoting near-orthogonality among vectors and sparsity of each vector. We apply the proposed regularizer to two models: neural networks (NNs) and sparse coding (SC), and develop an efficient ADMM-based algorithm for regularized SC. 
Experiments on various datasets demonstrate that weight vectors learned under our regularizer are more interpretable and have better generalization performance.", "keywords": "Less-overlapness;regularization;near-orthogonality;sparsity", "primary_area": "", "supplementary_material": "", "author": "Hongbao Zhang;Pengtao Xie;Eric Xing", "authorids": "hongbao.zhang@petuum.com;pengtaox@cs.cmu.edu;eric.xing@petuum.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2018learning,\ntitle={Learning Less-Overlapping Representations},\nauthor={Hongbao Zhang and Pengtao Xie and Eric Xing},\nyear={2018},\nurl={https://openreview.net/forum?id=r1kjEuHpZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1kjEuHpZ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6013448569628103362&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Kernel Implicit Variational Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/11", "id": "r1l4eQW0Z", "author_site": "Jiaxin Shi, Shengyang Sun, Jun Zhu", "tldr": "", "abstract": "Recent progress in variational inference has paid much attention to the flexibility of variational posteriors. One promising direction is to use implicit distributions, i.e., distributions without tractable densities as the variational posterior. However, existing methods on implicit posteriors still face challenges of noisy estimation and computational infeasibility when applied to models with high-dimensional latent variables. In this paper, we present a new approach named Kernel Implicit Variational Inference that addresses these challenges. 
As far as we know, for the first time implicit variational inference is successfully applied to Bayesian neural networks, which shows promising results on both regression and classification tasks.", "keywords": "Variational inference;Bayesian neural networks;Implicit distribution", "primary_area": "", "supplementary_material": "", "author": "Jiaxin Shi;Shengyang Sun;Jun Zhu", "authorids": "shijx15@mails.tsinghua.edu.cn;ssy@cs.toronto.edu;dcszj@tsinghua.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nshi2018kernel,\ntitle={Kernel Implicit Variational Inference},\nauthor={Jiaxin Shi and Shengyang Sun and Jun Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1l4eQW0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12164297985186299084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1l4eQW0Z", "pdf": "https://openreview.net/pdf?id=r1l4eQW0Z", "email": ";;", "author_num": 3 }, { "title": "Demystifying MMD GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/54", "id": "r1lUOzWCW", "author_site": "Mikolaj Binkowski, [deadname] Sutherland, Michael Arbel, Arthur Gretton", "tldr": "Explain bias situation with MMD GANs; MMD GANs work with smaller critic networks than WGAN-GPs; new GAN evaluation metric.", "abstract": "We investigate the training and performance of generative adversarial networks using the Maximum Mean Discrepancy (MMD) as critic, termed MMD GANs. As our main theoretical contribution, we clarify the situation with bias in GAN loss functions raised by recent work: we show that gradient estimators used in the optimization process for both MMD GANs and Wasserstein GANs are unbiased, but learning a discriminator based on samples leads to biased gradients for the generator parameters. We also discuss the issue of kernel choice for the MMD critic, and characterize the kernel corresponding to the energy distance used for the Cram\u00e9r GAN critic. Being an integral probability metric, the MMD benefits from training strategies recently developed for Wasserstein GANs. In experiments, the MMD GAN is able to employ a smaller critic network than the Wasserstein GAN, resulting in a simpler and faster-training algorithm with matching performance. We also propose an improved measure of GAN convergence, the Kernel Inception Distance, and show how to use it to dynamically adapt learning rates during GAN training.", "keywords": "gans;mmd;ipms;wgan;gradient penalty;unbiased gradients", "primary_area": "", "supplementary_material": "", "author": "Miko\u0142aj Bi\u0144kowski;Danica J. 
Sutherland;Michael Arbel;Arthur Gretton", "authorids": "mikbinkowski@gmail.com;dsuth@cs.ubc.ca;michael.n.arbel@gmail.com;arthur.gretton@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbi\u0144kowski2018demystifying,\ntitle={Demystifying {MMD} {GAN}s},\nauthor={Miko\u0142aj Bi\u0144kowski and Dougal J. Sutherland and Michael Arbel and Arthur Gretton},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1lUOzWCW},\n}", "github": "[![github](/images/github_icon.svg) mbinkowski/MMD-GAN](https://github.com/mbinkowski/MMD-GAN) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=r1lUOzWCW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;2;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.1889822365046137, "gs_citation": 1874, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10236052458128513824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=r1lUOzWCW", "pdf": "https://openreview.net/pdf?id=r1lUOzWCW", "email": ";;;", "author_num": 4 }, { "id": "r1lfpfZAb", "title": "Learning to Write by Learning the Objective", "track": "main", "status": "Workshop", "tldr": "We build a stronger natural language generator by discriminatively training scoring functions that rank candidate generations with respect to various qualities of good writing.", "abstract": "Recurrent Neural Networks (RNNs) are powerful autoregressive sequence models for learning prevalent patterns in natural language. Yet language generated by RNNs often shows several degenerate characteristics that are uncommon in human language; while fluent, RNN language production can be overly generic, repetitive, and even self-contradictory. We postulate that the objective function optimized by RNN language models, which amounts to the overall perplexity of a text, is not expressive enough to capture the abstract qualities of good generation such as Grice\u2019s Maxims. In this paper, we introduce a general learning framework that can construct a decoding objective better suited for generation. Starting with a generatively trained RNN language model, our framework learns to construct a substantially stronger generator by combining several discriminatively trained models that can collectively address the limitations of RNN generation. 
Human evaluation demonstrates that text generated by the resulting generator is preferred over that of baselines by a large margin and significantly enhances the overall coherence, style, and information content of the generated text.", "keywords": "natural language generation", "primary_area": "", "supplementary_material": "", "author": "Ari Holtzman;Jan Buys;Maxwell Forbes;Antoine Bosselut;Yejin Choi", "authorids": "ahai@cs.washington.edu;jbuys@cs.washington.edu;mbforbes@cs.washington.edu;antoineb@cs.washington.edu;yejin@cs.washington.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nholtzman2018learning,\ntitle={Learning to Write by Learning the Objective},\nauthor={Ari Holtzman and Jan Buys and Maxwell Forbes and Antoine Bosselut and Yejin Choi},\nyear={2018},\nurl={https://openreview.net/forum?id=r1lfpfZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1lfpfZAb", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;5", "rating_avg": 5.0, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1nmx5l0W", "title": "SIC-GAN: A Self-Improving Collaborative GAN for Decoding Sketch RNNs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational RNNs are proposed to output \u201ccreative\u201d sequences. Ideally, a collection of sequences produced by a variational RNN should be of both high quality and high variety. However, existing decoders for variational RNNs suffer from a trade-off between quality and variety. In this paper, we seek to learn a variational RNN that decodes high-quality and high-variety sequences. We propose the Self-Improving Collaborative GAN (SIC-GAN), where there are two generators (variational RNNs) collaborating with each other to output a sequence and aiming to trick the discriminator into believing the sequence is of good quality. By deliberately weakening one generator, we can make another stronger in balancing quality and variety. We conduct experiments using the QuickDraw dataset and the results demonstrate the effectiveness of SIC-GAN empirically. 
", "keywords": "RNNs;GANs;Variational RNNs;Sketch RNNs", "primary_area": "", "supplementary_material": "", "author": "Chi-Chun Chuang;Zheng-Xin Weng;Shan-Hung Wu", "authorids": "ccchuang@datalab.cs.nthu.edu.tw;zxweng@datalab.cs.nthu.edu.tw;shwu@cs.nthu.edu.tw", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchuang2018sicgan,\ntitle={{SIC}-{GAN}: A Self-Improving Collaborative {GAN} for Decoding Sketch {RNN}s},\nauthor={Chi-Chun Chuang and Zheng-Xin Weng and Shan-Hung Wu},\nyear={2018},\nurl={https://openreview.net/forum?id=r1nmx5l0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1nmx5l0W", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.7559289460184546, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IcYO4GGeZEEJ:scholar.google.com/&scioq=SIC-GAN:+A+Self-Improving+Collaborative+GAN+for+Decoding+Sketch+RNNs&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1nzLmWAb", "title": "Video Action Segmentation with Hybrid Temporal Networks", "track": "main", "status": "Reject", "tldr": "We propose a new hybrid temporal network that achieves state-of-the-art performance on video action segmentation on three public datasets.", "abstract": "Action segmentation as a milestone towards building automatic systems to understand untrimmed videos has received considerable attention in the recent years. It is typically being modeled as a sequence labeling problem but contains intrinsic and sufficient differences than text parsing or speech processing. In this paper, we introduce a novel hybrid temporal convolutional and recurrent network (TricorNet), which has an encoder-decoder architecture: the encoder consists of a hierarchy of temporal convolutional kernels that capture the local motion changes of different actions; the decoder is a hierarchy of recurrent neural networks that are able to learn and memorize long-term action dependencies after the encoding stage. Our model is simple but extremely effective in terms of video sequence labeling. 
The experimental results on three public action segmentation datasets have shown that the proposed model achieves superior performance over the state of the art.", "keywords": "action segmentation;video labeling;temporal networks", "primary_area": "", "supplementary_material": "", "author": "Li Ding;Chenliang Xu", "authorids": "liding@rochester.edu;chenliang.xu@rochester.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nding2018video,\ntitle={Video Action Segmentation with Hybrid Temporal Networks},\nauthor={Li Ding and Chenliang Xu},\nyear={2018},\nurl={https://openreview.net/forum?id=r1nzLmWAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1nzLmWAb", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4VtH3XPOIOwJ:scholar.google.com/&scioq=Video+Action+Segmentation+with+Hybrid+Temporal+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1pW0WZAW", "title": "Analyzing and Exploiting NARX Recurrent Neural Networks for Long-Term Dependencies", "track": "main", "status": "Workshop", "tldr": "We introduce MIST RNNs, which a) exhibit superior vanishing-gradient properties in comparison to LSTM; b) improve performance substantially over LSTM and Clockwork RNNs on tasks requiring very long-term dependencies; and c) are much more efficient than previously-proposed NARX RNNs, with even fewer parameters and operations than LSTM.", "abstract": "Recurrent neural networks (RNNs) have achieved state-of-the-art performance on many diverse tasks, from machine translation to surgical activity recognition, yet training RNNs to capture long-term dependencies remains difficult. To date, the vast majority of successful RNN architectures alleviate this problem using nearly-additive connections between states, as introduced by long short-term memory (LSTM). We take an orthogonal approach and introduce MIST RNNs, a NARX RNN architecture that allows direct connections from the very distant past. We show that MIST RNNs 1) exhibit superior vanishing-gradient properties in comparison to LSTM and previously-proposed NARX RNNs; 2) are far more efficient than previously-proposed NARX RNN architectures, requiring even fewer computations than LSTM; and 3) improve performance substantially over LSTM and Clockwork RNNs on tasks requiring very long-term dependencies.", "keywords": "recurrent neural networks;long-term dependencies;long short-term memory;LSTM", "primary_area": "", "supplementary_material": "", "author": "Robert DiPietro;Christian Rupprecht;Nassir Navab;Gregory D. Hager", "authorids": "rdipietro@gmail.com;christian.rupprecht@in.tum.de;nassir.navab@tum.de;hager@cs.jhu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndipietro2018analyzing,\ntitle={Analyzing and Exploiting {NARX} Recurrent Neural Networks for Long-Term Dependencies},\nauthor={Robert DiPietro and Christian Rupprecht and Nassir Navab and Gregory D. 
Hager},\nyear={2018},\nurl={https://openreview.net/forum?id=r1pW0WZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1pW0WZAW", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;4;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.6933752452815364, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11151885715547948232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "The Implicit Bias of Gradient Descent on Separable Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/236", "id": "r1q7n9gAb", "author_site": "Daniel Soudry, Elad Hoffer, Mor Shpigel Nacson, Nathan Srebro", "tldr": "The normalized solution of gradient descent on logistic regression (or a similarly decaying loss) slowly converges to the L2 max margin solution on separable data.", "abstract": "We show that gradient descent on an unregularized logistic regression\nproblem, for almost all separable datasets, converges to the same direction as the max-margin solution. The result generalizes also to other monotone decreasing loss functions with an infimum at infinity, and we also discuss a multi-class generalizations to the cross entropy loss. Furthermore,\nwe show this convergence is very slow, and only logarithmic in the\nconvergence of the loss itself. This can help explain the benefit\nof continuing to optimize the logistic or cross-entropy loss even\nafter the training error is zero and the training loss is extremely\nsmall, and, as we show, even if the validation loss increases. Our\nmethodology can also aid in understanding implicit regularization\nin more complex models and with other optimization methods. 
", "keywords": "gradient descent;implicit regularization;generalization;margin;logistic regression;loss functions;optimization;exponential tail;cross-entropy", "primary_area": "", "supplementary_material": "", "author": "Daniel Soudry;Elad Hoffer;Mor Shpigel Nacson;Nathan Srebro", "authorids": "daniel.soudry@gmail.com;elad.hoffer@gmail.com;mor.shpigel@gmail.com;nati@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsoudry2018the,\ntitle={The Implicit Bias of Gradient Descent on Separable Data},\nauthor={Daniel Soudry and Elad Hoffer and Nathan Srebro},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1q7n9gAb},\n}", "github": "[![github](/images/github_icon.svg) paper-submissions/MaxMargin](https://github.com/paper-submissions/MaxMargin) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1q7n9gAb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.944911182523068, "gs_citation": 1096, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8363232294125339657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=r1q7n9gAb", "pdf": "https://openreview.net/pdf?id=r1q7n9gAb", "email": ";;;", "author_num": 4 }, { "id": "r1saNM-RW", "title": "Small Coresets to Represent Large Training Data for Support Vector Machines", "track": "main", "status": "Reject", "tldr": "We present an algorithm for speeding up SVM training on massive data sets by constructing compact representations that provide efficient and provably approximate inference.", "abstract": "Support Vector Machines (SVMs) are one of the most popular algorithms for classification and regression analysis. Despite their popularity, even efficient implementations have proven to be computationally expensive to train at a large-scale, especially in streaming settings. In this paper, we propose a novel coreset construction algorithm for efficiently generating compact representations of massive data sets to speed up SVM training. A coreset is a weighted subset of the original data points such that SVMs trained on the coreset are provably competitive with those trained on the original (massive) data set. We provide both lower and upper bounds on the number of samples required to obtain accurate approximations to the SVM problem as a function of the complexity of the input data. Our analysis also establishes sufficient conditions on the existence of sufficiently compact and representative coresets for the SVM problem. 
We empirically evaluate the practical effectiveness of our algorithm against synthetic and real-world data sets.", "keywords": "coresets;data compression", "primary_area": "", "supplementary_material": "", "author": "Cenk Baykal;Murad Tukan;Dan Feldman;Daniela Rus", "authorids": "baykal@mit.edu;muradtuk@gmail.com;dannyf.post@gmail.com;rus@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbaykal2018small,\ntitle={Small Coresets to Represent Large Training Data for Support Vector Machines},\nauthor={Cenk Baykal and Murad Tukan and Dan Feldman and Daniela Rus},\nyear={2018},\nurl={https://openreview.net/forum?id=r1saNM-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1saNM-RW", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11771203762064618990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1tJKuyRZ", "title": "The Set Autoencoder: Unsupervised Representation Learning for Sets", "track": "main", "status": "Reject", "tldr": "We propose the set autoencoder, a model for unsupervised representation learning for sets of elements.", "abstract": "We propose the set autoencoder, a model for unsupervised representation learning for sets of elements. It is closely related to sequence-to-sequence models, which learn fixed-sized latent representations for sequences, and have been applied to a number of challenging supervised sequence tasks such as machine translation, as well as unsupervised representation learning for sequences.\nIn contrast to sequences, sets are permutation invariant. The proposed set autoencoder considers this fact, both with respect to the input as well as the output of the model. On the input side, we adapt a recently-introduced recurrent neural architecture using a content-based attention mechanism. On the output side, we use a stable marriage algorithm to align predictions to labels in the learning phase.\nWe train the model on synthetic data sets of point clouds and show that the learned representations change smoothly with translations in the inputs, preserve distances in the inputs, and that the set size is represented directly. We apply the model to supervised tasks on the point clouds using the fixed-size latent representation. For a number of difficult classification problems, the results are better than those of a model that does not consider the permutation invariance. 
Especially for small training sets, the set-aware model benefits from unsupervised pretraining.", "keywords": "set;unsupervised learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Malte Probst", "authorids": "malte.probst@honda-ri.de", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nprobst2018the,\ntitle={The Set Autoencoder: Unsupervised Representation Learning for Sets},\nauthor={Malte Probst},\nyear={2018},\nurl={https://openreview.net/forum?id=r1tJKuyRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1tJKuyRZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12490543009274216937&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1uOhfb0W", "title": "Learning Sparse Structured Ensembles with SG-MCMC and Network Pruning", "track": "main", "status": "Reject", "tldr": "Propose a novel method by integrating SG-MCMC sampling, group sparse prior and network pruning to learn Sparse Structured Ensemble (SSE) with improved performance and significantly reduced cost than traditional methods. ", "abstract": "An ensemble of neural networks is known to be more robust and accurate than an individual network, however usually with linearly-increased cost in both training and testing. \nIn this work, we propose a two-stage method to learn Sparse Structured Ensembles (SSEs) for neural networks.\nIn the first stage, we run SG-MCMC with group sparse priors to draw an ensemble of samples from the posterior distribution of network parameters. 
In the second stage, we apply weight-pruning to each sampled network and then perform retraining over the remaining connections.\nIn this way of learning SSEs with SG-MCMC and pruning, we not only achieve high prediction accuracy since SG-MCMC enhances exploration of the model-parameter space, but also reduce memory and computation cost significantly in both training and testing of NN ensembles.\nThis is thoroughly evaluated in the experiments of learning SSE ensembles of both FNNs and LSTMs.\nFor example, in LSTM based language modeling (LM), we obtain 21\% relative reduction in LM perplexity by learning an SSE of 4 large LSTM models, which has only 30\% of model parameters and 70\% of computations in total, as compared to the baseline large LSTM LM.\nTo the best of our knowledge, this work represents the first methodology and empirical study of integrating SG-MCMC, group sparse prior and network pruning together for learning NN ensembles.", "keywords": "ensemble learning;SG-MCMC;group sparse prior;network pruning", "primary_area": "", "supplementary_material": "", "author": "Yichi Zhang;Zhijian Ou", "authorids": "zhangyic17@mails.tsinghua.edu.cn;ozj@tsinghua.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2018learning,\ntitle={Learning Sparse Structured Ensembles with {SG}-{MCMC} and Network Pruning},\nauthor={Yichi Zhang and Zhijian Ou},\nyear={2018},\nurl={https://openreview.net/forum?id=r1uOhfb0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1uOhfb0W", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JIso8yHPscIJ:scholar.google.com/&scioq=Learning+Sparse+Structured+Ensembles+with+SG-MCMC+and+Network+Pruning&hl=en&as_sdt=0,33", "gs_version_total": 4 }, { "id": "r1vccClCb", "title": "Neighbor-encoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel unsupervised representation learning framework called neighbor-encoder in which domain knowledge can be trivially incorporated into the learning process without modifying the general encoder-decoder architecture. In contrast to autoencoder, which reconstructs the input data, neighbor-encoder reconstructs the input data's neighbors. The proposed neighbor-encoder can be considered as a generalization of autoencoder as the input data can be treated as the nearest neighbor of itself with zero distance. By reformulating the representation learning problem as a neighbor reconstruction problem, domain knowledge can be easily incorporated with appropriate definition of similarity or distance between objects. As such, any existing similarity search algorithms can be easily integrated into our framework. Applications of other algorithms (e.g., association rule mining) in our framework are also possible since the concept of ``neighbor\" is an abstraction which can be appropriately defined differently in different contexts. We have demonstrated the effectiveness of our framework in various domains, including images, time series, music, etc., with various neighbor definitions. 
Experimental results show that neighbor-encoder outperforms autoencoder in most scenarios we considered.", "keywords": "unsupervised learning;representation learning;autoencoder", "primary_area": "", "supplementary_material": "", "author": "Chin-Chia Michael Yeh;Yan Zhu;Evangelos E. Papalexakis;Abdullah Mueen;Eamonn Keogh", "authorids": "myeh003@ucr.edu;yzhu015@ucr.edu;epapalex@cs.ucr.edu;mueen@unm.edu;eamonn@cs.ucr.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmichael2018neighborencoder,\ntitle={Neighbor-encoder},\nauthor={Chin-Chia Michael Yeh and Yan Zhu and Evangelos E. Papalexakis and Abdullah Mueen and Eamonn Keogh},\nyear={2018},\nurl={https://openreview.net/forum?id=r1vccClCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1vccClCb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fv-uR0oFqFMJ:scholar.google.com/&scioq=Neighbor-encoder&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "title": "HexaConv", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/77", "id": "r1vuQG-CW", "author_site": "Emiel Hoogeboom, Jorn Peters, Taco Cohen, Max Welling", "tldr": "We introduce G-HexaConv, a group equivariant convolutional neural network on hexagonal lattices.", "abstract": "The effectiveness of Convolutional Neural Networks stems in large part from their ability to exploit the translation invariance that is inherent in many learning problems. Recently, it was shown that CNNs can exploit other invariances, such as rotation invariance, by using group convolutions instead of planar convolutions. However, for reasons of performance and ease of implementation, it has been necessary to limit the group convolution to transformations that can be applied to the filters without interpolation. Thus, for images with square pixels, only integer translations, rotations by multiples of 90 degrees, and reflections are admissible.\n\nWhereas the square tiling provides a 4-fold rotational symmetry, a hexagonal tiling of the plane has a 6-fold rotational symmetry. In this paper we show how one can efficiently implement planar convolution and group convolution over hexagonal lattices, by re-using existing highly optimized convolution routines. We find that, due to the reduced anisotropy of hexagonal filters, planar HexaConv provides better accuracy than planar convolution with square filters, given a fixed parameter budget. Furthermore, we find that the increased degree of symmetry of the hexagonal grid increases the effectiveness of group convolutions, by allowing for more parameter sharing. We show that our method significantly outperforms conventional CNNs on the AID aerial scene classification dataset, even outperforming ImageNet pre-trained models.", "keywords": "hexagonal;group;symmetry;representation learning;rotation;equivariance;invariance", "primary_area": "", "supplementary_material": "", "author": "Emiel Hoogeboom;Jorn W.T. Peters;Taco S. 
Cohen;Max Welling", "authorids": "e.hoogeboom@gmail.com;jornpeters@gmail.com;taco.cohen@gmail.com;welling.max@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhoogeboom2018hexaconv,\ntitle={HexaConv},\nauthor={Emiel Hoogeboom and Jorn W.T. Peters and Taco S. Cohen and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1vuQG-CW},\n}", "github": "[![github](/images/github_icon.svg) ehoogeboom/hexaconv](https://github.com/ehoogeboom/hexaconv)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3503620825946735449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r1vuQG-CW", "pdf": "https://openreview.net/pdf?id=r1vuQG-CW", "email": ";;;", "author_num": 4 }, { "title": "Few-shot Autoregressive Density Estimation: Towards Learning to Learn Distributions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/189", "id": "r1wEFyWCW", "author_site": "Scott Reed, Yutian Chen, Thomas Paine, Aaron v den, S. M. Ali Eslami, Danilo Jimenez Rezende, Oriol Vinyals, Nando de Freitas", "tldr": "Few-shot learning PixelCNN", "abstract": "Deep autoregressive models have shown state-of-the-art performance in density estimation for natural images on large-scale datasets such as ImageNet. However, such models require many thousands of gradient-based weight updates and unique image examples for training. Ideally, the models would rapidly learn visual concepts from only a handful of examples, similar to the manner in which humans learn across many vision tasks. In this paper, we show how 1) neural attention and 2) meta learning techniques can be used in combination with autoregressive models to enable effective few-shot density estimation. Our proposed modifications to PixelCNN result in state-of-the-art few-shot density estimation on the Omniglot dataset. Furthermore, we visualize the learned attention policy and find that it learns intuitive algorithms for simple tasks such as image mirroring on ImageNet and handwriting on Omniglot without supervision. Finally, we extend the model to natural images and demonstrate few-shot image generation on the Stanford Online Products dataset.", "keywords": "few-shot learning;density models;meta learning", "primary_area": "", "supplementary_material": "", "author": "Scott Reed;Yutian Chen;Thomas Paine;A\u00e4ron van den Oord;S. M. 
Ali Eslami;Danilo Rezende;Oriol Vinyals;Nando de Freitas", "authorids": "reedscot@google.com;yutianc@google.com;tpaine@google.com;avdnoord@google.com;aeslami@google.com;danilor@google.com;vinyals@google.com;nandodefreitas@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nreed2018fewshot,\ntitle={Few-shot Autoregressive Density Estimation: Towards Learning to Learn Distributions},\nauthor={Scott Reed and Yutian Chen and Thomas Paine and A\u00e4ron van den Oord and S. M. Ali Eslami and Danilo Rezende and Oriol Vinyals and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=r1wEFyWCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 8, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12042481610635531058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r1wEFyWCW", "pdf": "https://openreview.net/pdf?id=r1wEFyWCW", "email": ";;;;;;;", "author_num": 8 }, { "id": "rJ1RPJWAW", "title": "Learnability of Learned Neural Networks", "track": "main", "status": "Reject", "tldr": "Exploring the Learnability of Learned Neural Networks", "abstract": "This paper explores the simplicity of learned neural networks under various settings: learned on real vs random data, varying size/architecture and using large minibatch size vs small minibatch size. The notion of simplicity used here is that of learnability i.e., how accurately can the prediction function of a neural network be learned from labeled samples from it. While learnability is different from (in fact often higher than) test accuracy, the results herein suggest that there is a strong correlation between small generalization errors and high learnability.\nThis work also shows that there exist significant qualitative differences in shallow networks as compared to popular deep networks. More broadly, this paper extends in a new direction, previous work on understanding the properties of learned neural networks. 
Our hope is that such an empirical study of understanding learned neural networks might shed light on the right assumptions that can be made for a theoretical study of deep learning.", "keywords": "Learnability;Generalizability;Understanding Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Rahul Anand Sharma;Navin Goyal;Monojit Choudhury;Praneeth Netrapalli", "authorids": "t-rahsha@microsoft.com;navingo@microsoft.com;monojitc@microsoft.com;praneeth@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nanand2018learnability,\ntitle={Learnability of Learned Neural Networks},\nauthor={Rahul Anand Sharma and Navin Goyal and Monojit Choudhury and Praneeth Netrapalli},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ1RPJWAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ1RPJWAW", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1243767346231576945&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rJ1os4eHf", "title": "Adaptive Weight Sparsity for Training Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We introduce adaptive weight sparsity, an algorithm that allows a neural network to learn a sparse connection pattern during training. We demonstrate that the proposed algorithm shows performance benefits across a wide variety of tasks and network structures, improving state-of-the-art results for recurrent networks of comparable size. We show that adaptive weight sparsity outperforms traditional pruning-based approaches to learning sparse configurations on convolutional and recurrent networks. 
We offer insights into the algorithm's behavior, demonstrating that training-time adaptivity is crucial to the success of the method and uncovering an interpretable evolution toward small-world network structures.", "keywords": "deep learning;sparsity;adaptive methods", "primary_area": "", "supplementary_material": "", "author": "Michael James;Jack Lindsey;Ilya Sharapov", "authorids": "michael@cerebras.net;jacklindsey@stanford.edu;ilya@cerebras.net", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ1os4eHf", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;2;3", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hNUpSGMk-L4J:scholar.google.com/&scioq=Adaptive+Weight+Sparsity+for+Training+Deep+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "SGD Learns Over-parameterized Networks that Provably Generalize on Linearly Separable Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/254", "id": "rJ33wwxRb", "author_site": "Alon Brutzkus, Amir Globerson, Eran Malach, Shai Shalev-Shwartz", "tldr": "We show that SGD learns two-layer over-parameterized neural networks with Leaky ReLU activations that provably generalize on linearly separable data.", "abstract": "Neural networks exhibit good generalization behavior in the\nover-parameterized regime, where the number of network parameters\nexceeds the number of observations. Nonetheless,\ncurrent generalization bounds for neural networks fail to explain this\nphenomenon. In an attempt to bridge this gap, we study the problem of\nlearning a two-layer over-parameterized neural network, when the data is generated by a linearly separable function. In the case where the network has Leaky\nReLU activations, we provide both optimization and generalization guarantees for over-parameterized networks.\nSpecifically, we prove convergence rates of SGD to a global\nminimum and provide generalization guarantees for this global minimum\nthat are independent of the network size. \nTherefore, our result clearly shows that the use of SGD for optimization both finds a global minimum, and avoids overfitting despite the high capacity of the model. 
This is the first theoretical demonstration that SGD can avoid overfitting when learning over-specified neural network classifiers.", "keywords": "Deep Learning;Non-convex Optimization;Generalization;Learning Theory;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Alon Brutzkus;Amir Globerson;Eran Malach;Shai Shalev-Shwartz", "authorids": "alonbrutzkus@mail.tau.ac.il;amir.globerson@gmail.com;eran.malach@mail.huji.ac.il;shais@cs.huji.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbrutzkus2018sgd,\ntitle={{SGD} Learns Over-parameterized Networks that Provably Generalize on Linearly Separable Data},\nauthor={Alon Brutzkus and Amir Globerson and Eran Malach and Shai Shalev-Shwartz},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ33wwxRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 305, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5213254119475520765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJ33wwxRb", "pdf": "https://openreview.net/pdf?id=rJ33wwxRb", "email": ";;;", "author_num": 4 }, { "id": "rJ3fy0k0Z", "title": "Deterministic Policy Imitation Gradient Algorithm", "track": "main", "status": "Reject", "tldr": "We propose a model free imitation learning algorithm that is able to reduce the number of interactions with the environment in comparison with the state-of-the-art imitation learning algorithm, namely GAIL.", "abstract": "The goal of imitation learning (IL) is to enable a learner to imitate an expert\u2019s behavior given the expert\u2019s demonstrations. Recently, generative adversarial imitation learning (GAIL) has successfully achieved it even on complex continuous control tasks. However, GAIL requires a huge number of interactions with the environment during training. We believe that IL algorithms could be more applicable to real-world environments if the number of interactions could be reduced. To this end, we propose a model free, off-policy IL algorithm for continuous control. The key ideas of our algorithm are twofold: 1) adopting a deterministic policy that allows us to derive a novel type of policy gradient which we call deterministic policy imitation gradient (DPIG), 2) introducing a function which we call the state screening function (SSF) to avoid noisy policy updates with states that are not typical of those that appear in the expert\u2019s demonstrations. 
Experimental results show that our algorithm can achieve the goal of IL with at least tens of times fewer interactions than GAIL on a variety of continuous control tasks.", "keywords": "Imitation Learning", "primary_area": "", "supplementary_material": "", "author": "Fumihiro Sasaki;Atsuo Kawaguchi", "authorids": "fumihiro.fs.sasaki@nts.ricoh.co.jp;atsuo.kawaguchi@nts.ricoh.co.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsasaki2018deterministic,\ntitle={Deterministic Policy Imitation Gradient Algorithm},\nauthor={Fumihiro Sasaki and Atsuo Kawaguchi},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ3fy0k0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJ3fy0k0Z", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GNL8_8skSOwJ:scholar.google.com/&scioq=Deterministic+Policy+Imitation+Gradient+Algorithm&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJ4uaX2aW", "title": "Large Batch Training of Convolutional Networks with Layer-wise Adaptive Rate Scaling", "track": "main", "status": "Reject", "tldr": "A new large batch training algorithm based on Layer-wise Adaptive Rate Scaling (LARS); using LARS, we scaled AlexNet and ResNet-50 to a batch of 16K.", "abstract": "A common way to speed up training of large convolutional networks is to add computational units. Training is then performed using data-parallel synchronous Stochastic Gradient Descent (SGD) with a mini-batch divided between computational units. With an increase in the number of nodes, the batch size grows. However, training with a large batch often results in lower model accuracy. We argue that the current recipe for large batch training (linear learning rate scaling with warm-up) is not general enough and training may diverge. To overcome these optimization difficulties, we propose a new training algorithm based on Layer-wise Adaptive Rate Scaling (LARS). 
Using LARS, we scaled AlexNet and ResNet-50 to a batch size of 16K.", "keywords": "large batch;LARS;adaptive rate scaling", "primary_area": "", "supplementary_material": "", "author": "Boris Ginsburg;Igor Gitman;Yang You", "authorids": "bginsburg@nvidia.com;igitman@andrew.cmu.edu;youyang@cs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nginsburg2018large,\ntitle={Large Batch Training of Convolutional Networks with Layer-wise Adaptive Rate Scaling},\nauthor={Boris Ginsburg and Igor Gitman and Yang You},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ4uaX2aW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ4uaX2aW", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13307369966517000873&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJ5C67-C-", "title": "Hyperedge2vec: Distributed Representations for Hyperedges", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data structured in the form of overlapping or non-overlapping sets is found in a variety of domains, sometimes explicitly but often subtly. For example, teams, which are of prime importance in social science studies, are \\enquote{sets of individuals}; \\enquote{item sets} in pattern mining are sets; and for various types of analysis in language studies a sentence can be considered as a \\enquote{set or bag of words}. Although building models and inference algorithms for structured data has been an important task in the fields of machine learning and statistics, research on \\enquote{set-like} data still remains less explored. Relationships between pairs of elements can be modeled as edges in a graph. However, for modeling relationships that involve all members of a set, a hyperedge is a more natural representation for the set. In this work, we focus on the problem of embedding hyperedges in a hypergraph (a network of overlapping sets) to a low dimensional vector space. We propose a probabilistic deep-learning based method as well as a tensor-based algebraic model, both of which capture the hypergraph structure in a principled manner without losing set-level information. Our central focus is to highlight the connection between hypergraphs (topology), tensors (algebra) and probabilistic models. We present a number of interesting baselines, some of which adapt existing node-level embedding models to the hyperedge-level, as well as sequence based language techniques which are adapted for set structured hypergraph topology. The performance is evaluated with a network of social groups and a network of word phrases. Our experiments show that, accuracy-wise, our methods perform similarly to baselines which are not designed for hypergraphs. Moreover, our tensor based method is quite efficient as compared to the deep-learning based auto-encoder method. We therefore argue that we have proposed more general methods which are suited for hypergraphs (and therefore also for graphs) while maintaining accuracy and efficiency. 
", "keywords": "hypergraph;representation learning;tensors", "primary_area": "", "supplementary_material": "", "author": "Ankit Sharma;Shafiq Joty;Himanshu Kharkwal;Jaideep Srivastava", "authorids": "sharm170@umn.edu;srjoty@ntu.edu.sg;himanshukharkwal765@gmail.com;srivasta@umn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsharma2018hyperedgevec,\ntitle={Hyperedge2vec: Distributed Representations for Hyperedges},\nauthor={Ankit Sharma and Shafiq Joty and Himanshu Kharkwal and Jaideep Srivastava},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ5C67-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ5C67-C-", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;3;4", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10110666618567809959&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rJ695PxRW", "title": "Discovering Order in Unordered Datasets: Generative Markov Networks", "track": "main", "status": "Reject", "tldr": "Propose to observe implicit orders in datasets in a generative model viewpoint.", "abstract": "The assumption that data samples are independently identically distributed is the backbone of many learning algorithms. Nevertheless, datasets often exhibit rich structures in practice, and we argue that there exist some unknown orders within the data instances. Aiming to find such orders, we introduce a novel Generative Markov Network (GMN) which we use to extract the order of data instances automatically. Specifically, we assume that the instances are sampled from a Markov chain. Our goal is to learn the transitional operator of the chain as well as the generation order by maximizing the generation probability under all possible data permutations. One of our key ideas is to use neural networks as a soft lookup table for approximating the possibly huge, but discrete transition matrix. This strategy allows us to amortize the space complexity with a single model and make the transitional operator generalizable to unseen instances. To ensure the learned Markov chain is ergodic, we propose a greedy batch-wise permutation scheme that allows fast training. 
Empirically, we evaluate the learned Markov chain by showing that GMNs are able to discover orders among data instances and also perform comparably well to state-of-the-art methods on the one-shot recognition benchmark task.", "keywords": "Markov chain;discovering orders;generative model;one-shot", "primary_area": "", "supplementary_material": "", "author": "Yao-Hung Hubert Tsai;Han Zhao;Nebojsa Jojic;Ruslan Salakhutdinov", "authorids": "yaohungt@cs.cmu.edu;han.zhao@cs.cmu.edu;jojic@microsoft.com;rsalakhu@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhubert2018discovering,\ntitle={Discovering Order in Unordered Datasets: Generative Markov Networks},\nauthor={Yao-Hung Hubert Tsai and Han Zhao and Nebojsa Jojic and Ruslan Salakhutdinov},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ695PxRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ695PxRW", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3511301920503823245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rJ6iJmWCW", "title": "POLICY DRIVEN GENERATIVE ADVERSARIAL NETWORKS FOR ACCENTED SPEECH GENERATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose the generation of accented speech using generative adversarial\nnetworks. Through this work we make two main contributions a) The\nability to condition latent representations while generating realistic speech samples\nb) The ability to efficiently generate long speech samples by using a novel\nlatent variable transformation module that is trained using policy gradients. Previous\nmethods are limited in being able to generate only relatively short samples\nor are not very efficient at generating long samples. The generated speech samples\nare validated through a number of various evaluation measures viz, a WGAN\ncritic loss and through subjective scores on user evaluations against competitive\nspeech synthesis baselines and detailed ablation analysis of the proposed model.\nThe evaluations demonstrate that the model generates realistic long speech samples\nconditioned on accent efficiently.", "keywords": "speech;generation;accent;gan;adversarial;reinforcement;memory;lstm;policy;gradients;human", "primary_area": "", "supplementary_material": "", "author": "Prannay Khosla;Preethi Jyothi;Vinay P. Namboodiri;Mukundhan Srinivasan", "authorids": "prannayk@iitk.ac.in;pjyothi@cse.iitb.ac.in;vinaypn@cse.iitk.ac.in;msrinivasan@nvidia.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkhosla2018policy,\ntitle={{POLICY} {DRIVEN} {GENERATIVE} {ADVERSARIAL} {NETWORKS} {FOR} {ACCENTED} {SPEECH} {GENERATION}},\nauthor={Prannay Khosla and Preethi Jyothi and Vinay P. 
Namboodiri and Mukundhan Srinivasan},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ6iJmWCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ6iJmWCW", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eLOif20QLXUJ:scholar.google.com/&scioq=POLICY+DRIVEN+GENERATIVE+ADVERSARIAL+NETWORKS+FOR+ACCENTED+SPEECH+GENERATION&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJ7RBNe0-", "title": "Generative Models for Alignment and Data Efficiency in Language", "track": "main", "status": "Reject", "tldr": "", "abstract": "We examine how learning from unaligned data can improve both the data efficiency of supervised tasks as well as enable alignments without any supervision. For example, consider unsupervised machine translation: the input is two corpora of English and French, and the task is to translate from one language to the other but without any pairs of English and French sentences. To address this, we develop feature-matching autoencoders (FMAEs). FMAEs ensure that the marginal distribution of feature layers are preserved across forward and inverse mappings between domains. We show that FMAEs achieve state of the art for data efficiency and alignment across three tasks: text decipherment, sentiment transfer, and neural machine translation for English-to-German and English-to-French. Most compellingly, FMAEs achieve state of the art for neural translation with limited supervision, with significant BLEU score differences of up to 5.7 and 6.3 over traditional supervised models. 
Furthermore, on English-to-German, they outperform last year's best fully supervised models such as ByteNet (Kalchbrenner et al., 2016) while using only half as many supervised examples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dustin Tran;Yura Burda;Ilya Sutskever", "authorids": "dustin@cs.columbia.edu;yburda@openai.com;ilyasu@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntran2018generative,\ntitle={Generative Models for Alignment and Data Efficiency in Language},\nauthor={Dustin Tran and Yura Burda and Ilya Sutskever},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ7RBNe0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ7RBNe0-", "pdf_size": 0, "rating": "2;4;5", "confidence": "3;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VQuLn5LrvLUJ:scholar.google.com/&scioq=Generative+Models+for+Alignment+and+Data+Efficiency+in+Language&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJ7yZ2P6-", "title": "Enhance Word Representation for Out-of-Vocabulary on Ubuntu Dialogue Corpus", "track": "main", "status": "Reject", "tldr": "Combine information between pre-built word embedding and task-specific word representation to address out-of-vocabulary issue", "abstract": "Ubuntu dialogue corpus is the largest publicly available dialogue corpus to make it feasible to build end-to-end\ndeep neural network models directly from the conversation data. One challenge of Ubuntu dialogue corpus is \nthe large number of out-of-vocabulary words. In this paper we proposed an algorithm which combines the general pre-trained word embedding vectors with those generated on the task-specific training set to address this issue. We integrated character embedding into Chen et al.'s Enhanced LSTM method (ESIM) and used it to evaluate the effectiveness of our proposed method. For the task of next utterance selection, the proposed method has demonstrated a significant performance improvement against the original ESIM and the new model has achieved state-of-the-art results on both Ubuntu dialogue corpus and Douban conversation corpus. In addition, we investigated the performance impact of end-of-utterance and end-of-turn token tags. 
", "keywords": "next utterance selection;ubuntu dialogue corpus;out-of-vocabulary;word representation", "primary_area": "", "supplementary_material": "", "author": "JIANXIONG DONG;Jim Huang", "authorids": "jdongca2003@gmail.com;ccjimhuang@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndong2018enhance,\ntitle={Enhance Word Representation for Out-of-Vocabulary on Ubuntu Dialogue Corpus},\nauthor={JIANXIONG DONG and Jim Huang},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ7yZ2P6-},\n}", "github": "[![github](/images/github_icon.svg) jdongca2003/next_utterance_selection](https://github.com/jdongca2003/next_utterance_selection)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=rJ7yZ2P6-", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10868817939143429862&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "rJ8X2VT7M", "title": "Information Theoretic Co-Training", "track": "main", "status": "Withdraw", "tldr": "Presents an information theoretic training objective for co-training and demonstrates its power in unsupervised learning of phonetics.", "abstract": "This paper introduces an information theoretic co-training objective for unsupervised learning. We consider the problem of predicting the future. Rather than predict future sensations (image pixels or sound waves) we predict ``hypotheses'' to be confirmed by future sensations. More formally, we assume a population distribution on pairs $(x,y)$ where we can think of $x$ as a past sensation and $y$ as a future sensation. We train both a predictor model $P_\\Phi(z|x)$ and a confirmation model $P_\\Psi(z|y)$ where we view $z$ as hypotheses (when predicted) or facts (when confirmed). For a population distribution on pairs $(x,y)$ we focus on the problem of measuring the mutual information between $x$ and $y$. By the data processing inequality this mutual information is at least as large as the mutual information between $x$ and $z$ under the distribution on triples $(x,z,y)$ defined by the confirmation model $P_\\Psi(z|y)$. The information theoretic training objective for $P_\\Phi(z|x)$ and $P_\\Psi(z|y)$ can be viewed as a form of co-training where we want the prediction from $x$ to match the confirmation from $y$. 
We give experiments on applications to learning phonetics on the TIMIT dataset.", "keywords": "co-training;phonetics;unsupervised learning;mutual information", "primary_area": "", "supplementary_material": "", "author": "David McAllester", "authorids": "mcallester@ttic.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJ8X2VT7M", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7874802414546570894&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rJ8rHkWRb", "title": "A Simple Fully Connected Network for Composing Word Embeddings from Characters", "track": "main", "status": "Reject", "tldr": "A fully connected architecture is used to produce word embeddings from character representations, outperforms traditional embeddings and provides insight into sparsity and dropout.", "abstract": "This work introduces a simple network for producing character aware word embeddings. Position agnostic and position aware character embeddings are combined to produce an embedding vector for each word. The learned word representations are shown to be very sparse and facilitate improved results on language modeling tasks, despite using markedly fewer parameters, and without the need to apply dropout. A final experiment suggests that weight sharing contributes to sparsity, increases performance, and prevents overfitting.", "keywords": "natural language processing;word embeddings;language models;neural network;deep learning;sparsity;dropout", "primary_area": "", "supplementary_material": "", "author": "Michael Traynor;Thomas Trappenberg", "authorids": "mike.sk.traynor@gmail.com;trappenberg@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntraynor2018a,\ntitle={A Simple Fully Connected Network for Composing Word Embeddings from Characters},\nauthor={Michael Traynor and Thomas Trappenberg},\nyear={2018},\nurl={https://openreview.net/forum?id=rJ8rHkWRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ8rHkWRb", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:X9-hZp8s9N8J:scholar.google.com/&scioq=A+Simple+Fully+Connected+Network+for+Composing+Word+Embeddings+from+Characters&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJBiunlAW", "title": "Training RNNs as Fast as CNNs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Common recurrent neural network architectures scale poorly due to the intrinsic difficulty in parallelizing their state computations. In this work, we propose the Simple Recurrent Unit (SRU) architecture, a recurrent unit that simplifies the computation and exposes more parallelism. 
In SRU, the majority of computation for each step is independent of the recurrence and can be easily parallelized. SRU is as fast as a convolutional layer and 5-10x faster than an optimized LSTM implementation. We study SRUs on a wide range of applications, including classification, question answering, language modeling, translation and speech recognition. Our experiments demonstrate the effectiveness of SRU and the trade-off it enables between speed and performance. ", "keywords": "recurrent neural networks;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Tao Lei;Yu Zhang;Yoav Artzi", "authorids": "tao@asapp.com;yzhang87@csail.mit.edu;yoav@cs.cornell.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlei2018training,\ntitle={Training {RNN}s as Fast as {CNN}s},\nauthor={Tao Lei and Yu Zhang and Yoav Artzi},\nyear={2018},\nurl={https://openreview.net/forum?id=rJBiunlAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJBiunlAW", "pdf_size": 0, "rating": "4;7;8", "confidence": "5;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 22, "authors#_avg": 3, "corr_rating_confidence": -0.2773500981126146, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12329870459937074539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJBwoM-Cb", "title": "Neural Tree Transducers for Tree to Tree Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a novel approach to tree-to-tree learning, the neural tree transducer (NTT), a top-down depth first context-sensitive tree decoder, which is paired with recursive neural encoders. Our method works purely on tree-to-tree manipulations rather than sequence-to-tree or tree-to-sequence and is able to encode and decode multiple depth trees. We compare our method to sequence-to-sequence models applied to serializations of the trees and show that our method outperforms previous methods for tree-to-tree transduction. 
", "keywords": "deep learning;tree transduction", "primary_area": "", "supplementary_material": "", "author": "Jo\u00e3o Sedoc;Dean Foster;Lyle Ungar", "authorids": "joao@cis.upenn.edu;dean@foster.net;ungar@cis.upenn.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsedoc2018neural,\ntitle={Neural Tree Transducers for Tree to Tree Learning},\nauthor={Jo\u00e3o Sedoc and Dean Foster and Lyle Ungar},\nyear={2018},\nurl={https://openreview.net/forum?id=rJBwoM-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJBwoM-Cb", "pdf_size": 0, "rating": "2;3;7", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.654653670707977, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5148858220359031838&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJFOptp6Z", "title": "Model Distillation with Knowledge Transfer from Face Classification to Alignment and Verification", "track": "main", "status": "Reject", "tldr": "We take face recognition as a breaking point and propose model distillation with knowledge transfer from face classification to alignment and verification", "abstract": "Knowledge distillation is a potential solution for model compression. The idea is to make a small student network imitate the target of a large teacher network, then the student network can be competitive to the teacher one. Most previous studies focus on model distillation in the classification task, where they propose different architectures and initializations for the student network. However, only the classification task is not enough, and other related tasks such as regression and retrieval are barely considered. To solve the problem, in this paper, we take face recognition as a breaking point and propose model distillation with knowledge transfer from face classification to alignment and verification. By selecting appropriate initializations and targets in the knowledge transfer, the distillation can be easier in non-classification tasks. Experiments on the CelebA and CASIA-WebFace datasets demonstrate that the student network can be competitive to the teacher one in alignment and verification, and even surpasses the teacher network under specific compression rates. In addition, to achieve stronger knowledge transfer, we also use a common initialization trick to improve the distillation performance of classification. 
Evaluations on the CASIA-Webface and large-scale MS-Celeb-1M datasets show the effectiveness of this simple trick.", "keywords": "distill;transfer;classification;alignment;verification", "primary_area": "", "supplementary_material": "", "author": "Chong Wang;Xipeng Lan;Yangang Zhang", "authorids": "chongwang.nlpr@gmail.com;xipeng.lan@gmail.com;caveman1984@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2018model,\ntitle={Model Distillation with Knowledge Transfer from Face Classification to Alignment and Verification},\nauthor={Chong Wang and Xipeng Lan and Yangang Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=rJFOptp6Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJFOptp6Z", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 1.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2689487347478082230&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rJGY8GbR-", "title": "Deep Mean Field Theory: Layerwise Variance and Width Variation as Methods to Control Gradient Explosion", "track": "main", "status": "Workshop", "tldr": "By setting the width or the initialization variance of each layer differently, we can actually subdue gradient explosion problems in residual networks (with fully connected layers and no batchnorm). A mathematical theory is developed that not only tells you how to do it, but also surprisingly is able to predict, after you apply such tricks, how fast your network trains to achieve a certain test set performance. This is some black magic stuff, and it's called \"Deep Mean Field Theory.\"", "abstract": "\tA recent line of work has studied the statistical properties of neural networks to great success from a {\\it mean field theory} perspective, making and verifying very precise predictions of neural network behavior and test time performance.\n\tIn this paper, we build upon these works to explore two methods for taming the behaviors of random residual networks (with only fully connected layers and no batchnorm).\n\tThe first method is {\\it width variation (WV)}, i.e. varying the widths of layers as a function of depth.\n\tWe show that width decay reduces gradient explosion without affecting the mean forward dynamics of the random network.\n\tThe second method is {\\it variance variation (VV)}, i.e. 
changing the initialization variances of weights and biases over depth.\n\tWe show VV, used appropriately, can reduce gradient explosion of tanh and ReLU resnets from $\\exp(\\Theta(\\sqrt L))$ and $\\exp(\\Theta(L))$ respectively to constant $\\Theta(1)$.\n\tA complete phase-diagram is derived for how variance decay affects different dynamics, such as those of gradient and activation norms.\n\tIn particular, we show the existence of many phase transitions where these dynamics switch between exponential, polynomial, logarithmic, and even constant behaviors.\n\tUsing the obtained mean field theory, we are able to track surprisingly well how VV at initialization time affects training and test time performance on MNIST after a set number of epochs: the level sets of test/train set accuracies coincide with the level sets of the expectations of certain gradient norms or of metric expressivity (as defined in \\cite{yang_meanfield_2017}), a measure of expansion in a random neural network.\n\tBased on insights from past works in deep mean field theory and information geometry, we also provide a new perspective on the gradient explosion/vanishing problems: they lead to ill-conditioning of the Fisher information matrix, causing optimization troubles.", "keywords": "mean field;dynamics;residual network;variance variation;width variation;initialization", "primary_area": "", "supplementary_material": "", "author": "Greg Yang;Sam S. Schoenholz", "authorids": "gregyang@microsoft.com;schsam@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nyang2018deep,\ntitle={Deep Mean Field Theory: Layerwise Variance and Width Variation as Methods to Control Gradient Explosion},\nauthor={Greg Yang and Sam S. Schoenholz},\nyear={2018},\nurl={https://openreview.net/forum?id=rJGY8GbR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=rJGY8GbR-", "pdf_size": 0, "rating": "5;5;7", "confidence": "1;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 2.3333333333333335, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3947818681877352708&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Emergent Communication in a Multi-Modal, Multi-Step Referential Game", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/212", "id": "rJGZq6g0-", "author_site": "Katrina Evtimova, Andrew Drozdov, Douwe Kiela, Kyunghyun Cho", "tldr": "", "abstract": "Inspired by previous work on emergent communication in referential games, we propose a novel multi-modal, multi-step referential game, where the sender and receiver have access to distinct modalities of an object, and their information exchange is bidirectional and of arbitrary duration. The multi-modal multi-step setting allows agents to develop an internal communication significantly closer to natural language, in that they share a single set of messages, and that the length of the conversation may vary according to the difficulty of the task. We examine these properties empirically using a dataset consisting of images and textual descriptions of mammals, where the agents are tasked with identifying the correct object. 
Our experiments indicate that a robust and efficient communication protocol emerges, where gradual information exchange informs better predictions and higher communication bandwidth improves generalization.", "keywords": "emergent communication;multi-agent systems;multi-modal", "primary_area": "", "supplementary_material": "", "author": "Katrina Evtimova;Andrew Drozdov;Douwe Kiela;Kyunghyun Cho", "authorids": "kve216@nyu.edu;apd283@nyu.edu;dkiela@fb.com;kyunghyun.cho@nyu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nevtimova2018emergent,\ntitle={Emergent Communication in a Multi-Modal, Multi-Step Referential Game},\nauthor={Katrina Evtimova and Andrew Drozdov and Douwe Kiela and Kyunghyun Cho},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJGZq6g0-},\n}", "github": "[![github](/images/github_icon.svg) nyu-dl/MultimodalGame](https://github.com/nyu-dl/MultimodalGame)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 117, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6581857213563474520&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJGZq6g0-", "pdf": "https://openreview.net/pdf?id=rJGZq6g0-", "email": ";;;", "author_num": 4 }, { "id": "rJHcpW-CW", "title": "NOVEL AND EFFECTIVE PARALLEL MIX-GENERATOR GENERATIVE ADVERSARIAL NETWORKS", "track": "main", "status": "Reject", "tldr": "multi generator to capture Pdata, solve the competition and one-beat-all problem", "abstract": "In this paper, we propose a mix-generator generative adversarial networks (PGAN) model that works in parallel by mixing multiple disjoint generators to approximate a complex real distribution. In our model, we propose an adjustment component that collects all the generated data points from the generators, learns the boundary between each pair of generators, and provides error to separate the support of each of the generated distributions. To overcome the instability in a multiplayer game, a shrinkage adjustment component method is introduced to gradually reduce the boundary between generators during the training procedure. To address the linearly growing training time problem in a multiple generators model, we propose a method to train the generators in parallel. This means that our work can be scaled up to large parallel computation frameworks. We present an efficient loss function for the discriminator, an effective adjustment component, and a suitable generator. We also show how to introduce the decay factor to stabilize the training procedure. We have performed extensive experiments on synthetic datasets, MNIST, and CIFAR-10. 
These experiments reveal that the error provided by the adjustment component could successfully separate the generated distributions and each of the generators can stably learn a part of the real distribution even if only a few modes are contained in the real distribution.", "keywords": "neural networks;generative adversarial networks;parallel", "primary_area": "", "supplementary_material": "", "author": "Xia Xiao;Sanguthevar Rajasekaran", "authorids": "xia.xiao@uconn.edu;sanguthevar.rajasekaran@uconn.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nxiao2018novel,\ntitle={{NOVEL} {AND} {EFFECTIVE} {PARALLEL} {MIX}-{GENERATOR} {GENERATIVE} {ADVERSARIAL} {NETWORKS}},\nauthor={Xia Xiao and Sanguthevar Rajasekaran},\nyear={2018},\nurl={https://openreview.net/forum?id=rJHcpW-CW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJHcpW-CW", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.944911182523068, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4181628657139177267&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJIN_4lA-", "title": "Maintaining cooperation in complex social dilemmas using deep reinforcement learning", "track": "main", "status": "Reject", "tldr": "How can we build artificial agents that solve social dilemmas (situations where individuals face a temptation to increase their payoffs at a cost to total welfare)?", "abstract": "Social dilemmas are situations where individuals face a temptation to increase their payoffs at a cost to total welfare. Building artificially intelligent agents that achieve good outcomes in these situations is important because many real world interactions include a tension between selfish interests and the welfare of others. We show how to modify modern reinforcement learning methods to construct agents that act in ways that are simple to understand, nice (begin by cooperating), provokable (try to avoid being exploited), and forgiving (try to return to mutual cooperation). We show both theoretically and experimentally that such agents can maintain cooperation in Markov social dilemmas. Our construction does not require training methods beyond a modification of self-play, thus if an environment is such that good strategies can be constructed in the zero-sum case (eg. Atari) then we can construct agents that solve social dilemmas in this environment. 
", "keywords": "reinforcement learning;cooperation;social dilemmas;game theory", "primary_area": "", "supplementary_material": "", "author": "Alexander Peysakhovich;Adam Lerer", "authorids": "alex.peys@gmail.com;alerer@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npeysakhovich2018maintaining,\ntitle={Maintaining cooperation in complex social dilemmas using deep reinforcement learning},\nauthor={Alexander Peysakhovich and Adam Lerer},\nyear={2018},\nurl={https://openreview.net/forum?id=rJIN_4lA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJIN_4lA-", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13243617783662042150&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rJIgf7bAZ", "title": "An inference-based policy gradient method for learning options", "track": "main", "status": "Reject", "tldr": "We develop a novel policy gradient method for the automatic learning of policies with options using a differentiable inference step.", "abstract": "In the pursuit of increasingly intelligent learning systems, abstraction plays a vital role in enabling sophisticated decisions to be made in complex environments. The options framework provides formalism for such abstraction over sequences of decisions. However most models require that options be given a priori, presumably specified by hand, which is neither efficient, nor scalable. Indeed, it is preferable to learn options directly from interaction with the environment. Despite several efforts, this remains a difficult problem: many approaches require access to a model of the environmental dynamics, and inferred options are often not interpretable, which limits our ability to explain the system behavior for verification or debugging purposes. In this work we develop a novel policy gradient method for the automatic learning of policies with options. This algorithm uses inference methods to simultaneously improve all of the options available to an agent, and thus can be employed in an off-policy manner, without observing option labels. Experimental results show that the options learned can be interpreted. Further, we find that the method presented here is more sample efficient than existing methods, leading to faster and more stable learning of policies with options.", "keywords": "reinforcement learning;hierarchy;options;inference", "primary_area": "", "supplementary_material": "", "author": "Matthew J. A. Smith;Herke van Hoof;Joelle Pineau", "authorids": "matthew.smith5@mail.mcgill.ca;herke.vanhoof@mail.mcgill.ca;jpineau@cs.mcgill.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nj.2018an,\ntitle={An inference-based policy gradient method for learning options},\nauthor={Matthew J. A. 
Smith and Herke van Hoof and Joelle Pineau},\nyear={2018},\nurl={https://openreview.net/forum?id=rJIgf7bAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJIgf7bAZ", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8204006548424630686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "rJJzTyWCZ", "title": "Large-scale Cloze Test Dataset Designed by Teachers", "track": "main", "status": "Reject", "tldr": "A cloze test dataset designed by teachers to assess language proficiency", "abstract": "Cloze test is widely adopted in language exams to evaluate students' language proficiency. In this paper, we propose the first large-scale human-designed cloze test dataset CLOTH in which the questions were used in middle-school and high-school language exams. With the missing blanks carefully created by teachers and candidate choices purposely designed to be confusing, CLOTH requires a deeper language understanding and a wider attention span than previous automatically generated cloze datasets. We show humans outperform dedicated designed baseline models by a significant margin, even when the model is trained on sufficiently large external data. We investigate the source of the performance gap, trace model deficiencies to some distinct properties of CLOTH, and identify the limited ability of comprehending a long-term context to be the key bottleneck. In addition, we find that human-designed data leads to a larger gap between the model's performance and human performance when compared to automatically generated data. ", "keywords": "dataset;human-designed;language understanding", "primary_area": "", "supplementary_material": "", "author": "Qizhe Xie;Guokun Lai;Zihang Dai;Eduard Hovy", "authorids": "qizhex@gmail.com;guokun@cs.cmu.edu;zander.dai@gmail.com;hovy@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxie2018largescale,\ntitle={Large-scale Cloze Test Dataset Designed by Teachers},\nauthor={Qizhe Xie and Guokun Lai and Zihang Dai and Eduard Hovy},\nyear={2018},\nurl={https://openreview.net/forum?id=rJJzTyWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJJzTyWCZ", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15968398830167367076&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJL6pz-CZ", "title": "Transfer Learning on Manifolds via Learned Transport Operators", "track": "main", "status": "Reject", "tldr": "Learning transport operators on manifolds forms a valuable representation for doing tasks like transfer learning.", "abstract": "Within-class variation in a high-dimensional dataset can be modeled as being on a low-dimensional manifold due to the constraints of the physical processes producing that variation (e.g., translation, illumination, etc.). 
We desire a method for learning a representation of the manifolds induced by identity-preserving transformations that can be used to increase robustness, reduce the training burden, and encourage interpretability in machine learning tasks. In particular, what is needed is a representation of the transformation manifold that can robustly capture the shape of the manifold from the input data, generate new points on the manifold, and extend transformations outside of the training domain without significantly increasing the error. Previous work has proposed algorithms to efficiently learn analytic operators (called transport operators) that define the process of transporting one data point on a manifold to another. The main contribution of this paper is to define two transfer learning methods that use this generative manifold representation to learn natural transformations and incorporate them into new data. The first method uses this representation in a novel randomized approach to transfer learning that employs the learned generative model to map out unseen regions of the data space. These results are shown through demonstrations of transfer learning in a data augmentation task for few-shot image classification. The second method uses transport operators to inject specific transformations into new data examples, which allows for realistic image animation and informed data augmentation. These results are shown on stylized constructions using the classic swiss roll data structure and in demonstrations of transfer learning in a data augmentation task for few-shot image classification.", "keywords": "manifold learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Marissa Connor;Christopher Rozell", "authorids": "marissa.connor@gatech.edu;crozell@gatech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nconnor2018transfer,\ntitle={Transfer Learning on Manifolds via Learned Transport Operators},\nauthor={Marissa Connor and Christopher Rozell},\nyear={2018},\nurl={https://openreview.net/forum?id=rJL6pz-CZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJL6pz-CZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6WWl_PvyunkJ:scholar.google.com/&scioq=Transfer+Learning+on+Manifolds+via+Learned+Transport+Operators&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJLTTe-0W", "title": "Bayesian Time Series Forecasting with Change Point and Anomaly Detection", "track": "main", "status": "Reject", "tldr": "We propose a novel state space time series model with the capability to capture the structure of change points and anomaly points, so that it has a better forecasting performance when there exist change points and anomalies in the time series.", "abstract": "Time series forecasting plays a crucial role in marketing, finance and many other quantitative fields. A large number of methodologies have been developed on this topic, including ARIMA, Holt\u2013Winters, etc. 
However, their performance is easily undermined by the existence of change points and anomaly points, two structures commonly observed in real data, but rarely considered in the aforementioned methods. In this paper, we propose a novel state space time series model, with the capability to capture the structure of change points and anomaly points, as well as trend and seasonality. To infer all the hidden variables, we develop a Bayesian framework, which is able to obtain distributions and forecasting intervals for time series forecasting, with provable theoretical properties. For implementation, an iterative algorithm with Markov chain Monte Carlo (MCMC), Kalman filter and Kalman smoothing is proposed. In both synthetic data and real data applications, our methodology yields a better performance in time series forecasting compared with existing methods, along with more accurate change point detection and anomaly detection.", "keywords": "Time Series Forecasting;Change Point Detection;Anomaly Detection;State Space Model;Bayesian", "primary_area": "", "supplementary_material": "", "author": "Anderson Y. Zhang;Miao Lu;Deguang Kong;Jimmy Yang", "authorids": "ye.zhang@yale.edu;mlu@oath.com;dkong@oath.com;jianyang@oath.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ny.2018bayesian,\ntitle={Bayesian Time Series Forecasting with Change Point and Anomaly Detection},\nauthor={Anderson Y. Zhang and Miao Lu and Deguang Kong and Jimmy Yang},\nyear={2018},\nurl={https://openreview.net/forum?id=rJLTTe-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJLTTe-0W", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;3", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10028885109982737058&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Flipout: Efficient Pseudo-Independent Weight Perturbations on Mini-Batches", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/35", "id": "rJNpifWAb", "author_site": "Yeming Wen, Paul Vicol, Jimmy Ba, Dustin Tran, Roger Grosse", "tldr": "We introduce flipout, an efficient method for decorrelating the gradients computed by stochastic neural net weights within a mini-batch by implicitly sampling pseudo-independent weight perturbations for each example.", "abstract": "Stochastic neural net weights are used in a variety of contexts, including regularization, Bayesian neural nets, exploration in reinforcement learning, and evolution strategies. Unfortunately, due to the large number of weights, all the examples in a mini-batch typically share the same weight perturbation, thereby limiting the variance reduction effect of large mini-batches. We introduce flipout, an efficient method for decorrelating the gradients within a mini-batch by implicitly sampling pseudo-independent weight perturbations for each example. Empirically, flipout achieves the ideal linear variance reduction for fully connected networks, convolutional networks, and RNNs. We find significant speedups in training neural networks with multiplicative Gaussian perturbations. 
We show that flipout is effective at regularizing LSTMs, and outperforms previous methods. Flipout also enables us to vectorize evolution strategies: in our experiments, a single GPU with flipout can handle the same throughput as at least 40 CPU cores using existing methods, equivalent to a factor-of-4 cost reduction on Amazon Web Services.", "keywords": "weight perturbation;reparameterization gradient;gradient variance reduction;evolution strategies;LSTM;regularization;optimization", "primary_area": "", "supplementary_material": "", "author": "Yeming Wen;Paul Vicol;Jimmy Ba;Dustin Tran;Roger Grosse", "authorids": "wenyemin@cs.toronto.edu;pvicol@cs.toronto.edu;jimmy@psi.toronto.edu;trandustin@google.com;rgrosse@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nwen2018flipout,\ntitle={Flipout: Efficient Pseudo-Independent Weight Perturbations on Mini-Batches},\nauthor={Yeming Wen and Paul Vicol and Jimmy Ba and Dustin Tran and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJNpifWAb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rJNpifWAb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 418, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3775795413523094771&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJNpifWAb", "pdf": "https://openreview.net/pdf?id=rJNpifWAb", "email": ";;;;", "author_num": 5 }, { "title": "Unbiased Online Recurrent Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/184", "id": "rJQDjk-0b", "author_site": "Corentin Tallec, Yann Ollivier", "tldr": "Introduces an online, unbiased and easily implementable gradient estimate for recurrent models.", "abstract": "The novel \\emph{Unbiased Online Recurrent Optimization} (UORO) algorithm allows for online learning of general recurrent computational graphs such as recurrent network models. It works in a streaming fashion and avoids backtracking through past activations and inputs. UORO is computationally as costly as \\emph{Truncated Backpropagation Through Time} (truncated BPTT), a widespread algorithm for online learning of recurrent networks \\cite{jaeger2002tutorial}. UORO is a modification of \\emph{NoBackTrack} \\cite{DBLP:journals/corr/OllivierC15} that bypasses the need for model sparsity and makes implementation easy in current deep learning frameworks, even for complex models. Like NoBackTrack, UORO provides unbiased gradient estimates; unbiasedness is the core hypothesis in stochastic gradient descent theory, without which convergence to a local optimum is not guaranteed. On the contrary, truncated BPTT does not provide this property, leading to possible divergence. On synthetic tasks where truncated BPTT is shown to diverge, UORO converges. 
For instance, when a parameter has a positive short-term but negative long-term influence, truncated BPTT diverges unless the truncation span is very significantly longer than the intrinsic temporal range of the interactions, while UORO performs well thanks to the unbiasedness of its gradients.\n", "keywords": "RNN", "primary_area": "", "supplementary_material": "", "author": "Corentin Tallec;Yann Ollivier", "authorids": "corentin.tallec@polytechnique.edu;yann@yann-ollivier.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntallec2018unbiased,\ntitle={Unbiased Online Recurrent Optimization},\nauthor={Corentin Tallec and Yann Ollivier},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJQDjk-0b},\n}", "github": "[![github](/images/github_icon.svg) ctallec/uoro](https://github.com/ctallec/uoro)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3493841590728342658&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rJQDjk-0b", "pdf": "https://openreview.net/pdf?id=rJQDjk-0b", "email": ";", "author_num": 2 }, { "id": "rJR2ylbRb", "title": "Spectral Graph Wavelets for Structural Role Similarity in Networks", "track": "main", "status": "Reject", "tldr": "We develop a method for learning structural signatures in networks based on the diffusion of spectral graph wavelets.", "abstract": "Nodes residing in different parts of a graph can have similar structural roles within their local network topology. The identification of such roles provides key insight into the organization of networks and can also be used to inform machine learning on graphs. However, learning structural representations of nodes is a challenging unsupervised-learning task, which typically involves manually specifying and tailoring topological features for each node. Here we develop GraphWave, a method that represents each node\u2019s local network neighborhood via a low-dimensional embedding by leveraging spectral graph wavelet diffusion patterns. We prove that nodes with similar local network neighborhoods will have similar GraphWave embeddings even though these nodes may reside in very different parts of the network. Our method scales linearly with the number of edges and does not require any hand-tailoring of topological features. 
We evaluate performance on both synthetic and real-world datasets, obtaining improvements of up to 71% over state-of-the-art baselines.", "keywords": "Graphs;Structural Similarities;Spectral Graph Wavelets;Graph Signal Processing;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Claire Donnat;Marinka Zitnik;David Hallac;Jure Leskovec", "authorids": "cdonnat@stanford.edu;marinka@cs.stanford.edu;hallac@stanford.edu;jure@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndonnat2018spectral,\ntitle={Spectral Graph Wavelets for Structural Role Similarity in Networks},\nauthor={Claire Donnat and Marinka Zitnik and David Hallac and Jure Leskovec},\nyear={2018},\nurl={https://openreview.net/forum?id=rJR2ylbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJR2ylbRb", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3677399018300277011&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJSr0GZR-", "title": "Learning Priors for Adversarial Autoencoders", "track": "main", "status": "Reject", "tldr": "Learning Priors for Adversarial Autoencoders", "abstract": "Most deep latent factor models choose simple priors for simplicity, tractability\nor not knowing what prior to use. Recent studies show that the choice of\nthe prior may have a profound effect on the expressiveness of the model,\nespecially when its generative network has limited capacity. In this paper, we propose to learn a proper prior from data for adversarial autoencoders\n(AAEs). We introduce the notion of code generators to transform manually selected\nsimple priors into ones that can better characterize the data distribution. Experimental results show that the proposed model can generate better image quality and learn better disentangled representations than\nAAEs in both supervised and unsupervised settings. 
Lastly, we present its\nability to do cross-domain translation in a text-to-image synthesis task.", "keywords": "deep learning;computer vision;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Hui-Po Wang;Wei-Jan Ko;Wen-Hsiao Peng", "authorids": "a88575847@gmail.com;ts771164@gmail.com;wpeng@cs.nctu.edu.tw", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2018learning,\ntitle={Learning Priors for Adversarial Autoencoders},\nauthor={Hui-Po Wang and Wei-Jan Ko and Wen-Hsiao Peng},\nyear={2018},\nurl={https://openreview.net/forum?id=rJSr0GZR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJSr0GZR-", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16562901270379041310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "rJTGkKxAZ", "title": "Learning Generative Models with Locally Disentangled Latent Factors", "track": "main", "status": "Reject", "tldr": "Decompose the task of learning a generative model into learning disentangled latent factors for subsets of the data and then learning the joint over those latent factors. ", "abstract": "One of the most successful techniques in generative models has been decomposing a complicated generation task into a series of simpler generation tasks. For example, generating an image at a low resolution and then learning to refine that into a high resolution image often improves results substantially. Here we explore a novel strategy for decomposing generation for complicated objects in which we first generate latent variables which describe a subset of the observed variables, and then map from these latent variables to the observed space. We show that this allows us to achieve decoupled training of complicated generative models and present both theoretical and experimental results supporting the benefit of such an approach. 
", "keywords": "Generative Models;Hierarchical Models;Latent Variable Models", "primary_area": "", "supplementary_material": "", "author": "Brady Neal;Alex Lamb;Sherjil Ozair;Devon Hjelm;Aaron Courville;Yoshua Bengio;Ioannis Mitliagkas", "authorids": "nealb@seas.upenn.edu;alex6200@gmail.com;sherjilozair@gmail.com;erroneus@gmail.com;aaron.courville@gmail.com;yoshua.umontreal@gmail.com;imitliagkas@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nneal2018learning,\ntitle={Learning Generative Models with Locally Disentangled Latent Factors},\nauthor={Brady Neal and Alex Lamb and Sherjil Ozair and Devon Hjelm and Aaron Courville and Yoshua Bengio and Ioannis Mitliagkas},\nyear={2018},\nurl={https://openreview.net/forum?id=rJTGkKxAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJTGkKxAZ", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 7, "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DLuoZKMZghcJ:scholar.google.com/&scioq=Learning+Generative+Models+with+Locally+Disentangled+Latent+Factors&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "title": "On the insufficiency of existing momentum schemes for Stochastic Optimization", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/46", "id": "rJTutzbA-", "author_site": "Rahul Kidambi, Praneeth Netrapalli, Prateek Jain, Sham M Kakade", "tldr": "Existing momentum/acceleration schemes such as heavy ball method and Nesterov's acceleration employed with stochastic gradients do not improve over vanilla stochastic gradient descent, especially when employed with small batch sizes.", "abstract": "Momentum based stochastic gradient methods such as heavy ball (HB) and Nesterov's accelerated gradient descent (NAG) method are widely used in practice for training deep networks and other supervised learning models, as they often provide significant improvements over stochastic gradient descent (SGD). Rigorously speaking, fast gradient methods have provable improvements over gradient descent only for the deterministic case, where the gradients are exact. In the stochastic case, the popular explanations for their wide applicability is that when these fast gradient methods are applied in the stochastic case, they partially mimic their exact gradient counterparts, resulting in some practical gain. This work provides a counterpoint to this belief by proving that there exist simple problem instances where these methods cannot outperform SGD despite the best setting of its parameters. These negative problem instances are, in an informal sense, generic; they do not look like carefully constructed pathological instances. These results suggest (along with empirical evidence) that HB or NAG's practical performance gains are a by-product of minibatching.\n\nFurthermore, this work provides a viable (and provable) alternative, which, on the same set of problem instances, significantly improves over HB, NAG, and SGD's performance. 
This algorithm, referred to as Accelerated Stochastic Gradient Descent (ASGD), is a simple to implement stochastic algorithm, based on a relatively less popular variant of Nesterov's Acceleration. Extensive empirical results in this paper show that ASGD has performance gains over HB, NAG, and SGD. The code for implementing the ASGD Algorithm can be found at https://github.com/rahulkidambi/AccSGD.\n", "keywords": "Stochastic Gradient Descent;Deep Learning;Momentum;Acceleration;Heavy Ball;Nesterov Acceleration;Stochastic Optimization;SGD;Accelerated Stochastic Gradient Descent", "primary_area": "", "supplementary_material": "", "author": "Rahul Kidambi;Praneeth Netrapalli;Prateek Jain;Sham M. Kakade", "authorids": "rkidambi@uw.edu;praneeth@microsoft.com;prajain@microsoft.com;sham@cs.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkidambi2018on,\ntitle={On the insufficiency of existing momentum schemes for Stochastic Optimization},\nauthor={Rahul Kidambi and Praneeth Netrapalli and Prateek Jain and Sham M. Kakade},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJTutzbA-},\n}", "github": "[![github](/images/github_icon.svg) rahulkidambi/AccSGD](https://github.com/rahulkidambi/AccSGD) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rJTutzbA-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 149, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6907311906014063619&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=rJTutzbA-", "pdf": "https://openreview.net/pdf?id=rJTutzbA-", "email": ";;;", "author_num": 4 }, { "id": "rJUBryZ0W", "title": "Lifelong Learning by Adjusting Priors", "track": "main", "status": "Reject", "tldr": "We develop a lifelong learning approach to transfer learning based on PAC-Bayes theory, whereby priors are adjusted as new tasks are encountered thereby facilitating the learning of novel tasks.", "abstract": "In representational lifelong learning an agent aims to continually learn to solve novel tasks while updating its representation in light of previous tasks. Under the assumption that future tasks are related to previous tasks, representations should be learned in such a way that they capture the common structure across learned tasks, while allowing the learner sufficient flexibility to adapt to novel aspects of a new task. We develop a framework for lifelong learning in deep neural networks that is based on generalization bounds, developed within the PAC-Bayes framework. Learning takes place through the construction of a distribution over networks based on the tasks seen so far, and its utilization for learning a new task. Thus, prior knowledge is incorporated through setting a history-dependent prior for novel tasks. We develop a gradient-based algorithm implementing these ideas, based on minimizing an objective function motivated by generalization bounds, and demonstrate its effectiveness through numerical examples. 
", "keywords": "Lifelong learning;Transfer learning;PAC-Bayes theory", "primary_area": "", "supplementary_material": "", "author": "Ron Amit;Ron Meir", "authorids": "ronamit@campus.technion.ac.il;rmeir@ee.technion.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\namit2018lifelong,\ntitle={Lifelong Learning by Adjusting Priors},\nauthor={Ron Amit and Ron Meir},\nyear={2018},\nurl={https://openreview.net/forum?id=rJUBryZ0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJUBryZ0W", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XCvEM8pTHK4J:scholar.google.com/&scioq=Lifelong+Learning+by+Adjusting+Priors&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "PixelDefend: Leveraging Generative Models to Understand and Defend against Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/163", "id": "rJUYGxbCW", "author_site": "Yang Song, Taesup Kim, Sebastian Nowozin, Stefano Ermon, Nate Kushman", "tldr": "", "abstract": "Adversarial perturbations of normal images are usually imperceptible to humans, but they can seriously confuse state-of-the-art machine learning models. What makes them so special in the eyes of image classifiers? In this paper, we show empirically that adversarial examples mainly lie in the low probability regions of the training distribution, regardless of attack types and targeted models. Using statistical hypothesis testing, we find that modern neural density models are surprisingly good at detecting imperceptible image perturbations. Based on this discovery, we devised PixelDefend, a new approach that purifies a maliciously perturbed image by moving it back towards the distribution seen in the training data. The purified image is then run through an unmodified classifier, making our method agnostic to both the classifier and the attacking method. As a result, PixelDefend can be used to protect already deployed models and be combined with other model-specific defenses. 
Experiments show that our method greatly improves resilience across a wide variety of state-of-the-art attacking methods, increasing accuracy on the strongest attack from 63% to 84% for Fashion MNIST and from 32% to 70% for CIFAR-10.", "keywords": "Adversarial Examples;Generative Models;Purification;Hypothesis Testing", "primary_area": "", "supplementary_material": "", "author": "Yang Song;Taesup Kim;Sebastian Nowozin;Stefano Ermon;Nate Kushman", "authorids": "yangsong@cs.stanford.edu;taesup.kim@umontreal.ca;sebastian.nowozin@microsoft.com;ermon@cs.stanford.edu;nkushman@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nsong2018pixeldefend,\ntitle={PixelDefend: Leveraging Generative Models to Understand and Defend against Adversarial Examples},\nauthor={Yang Song and Taesup Kim and Sebastian Nowozin and Stefano Ermon and Nate Kushman},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJUYGxbCW},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/PixelDefend](https://github.com/Microsoft/PixelDefend)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1019, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9269726813530152599&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJUYGxbCW", "pdf": "https://openreview.net/pdf?id=rJUYGxbCW", "email": ";;;;", "author_num": 5 }, { "id": "rJVruWZRW", "title": "Dense Recurrent Neural Network with Attention Gate", "track": "main", "status": "Reject", "tldr": "Dense RNN that has fully connections from each hidden state to multiple preceding hidden states of all layers directly.", "abstract": "We propose the dense RNN, which has the fully connections from each hidden state to multiple preceding hidden states of all layers directly. As the density of the connection increases, the number of paths through which the gradient flows can be increased. It increases the magnitude of gradients, which help to prevent the vanishing gradient problem in time. Larger gradients, however, can also cause exploding gradient problem. To complement the trade-off between two problems, we propose an attention gate, which controls the amounts of gradient flows. We describe the relation between the attention gate and the gradient flows by approximation. 
The experiment on the language modeling using Penn Treebank corpus shows dense connections with the attention gate improve the model\u2019s performance.", "keywords": "recurrent neural network;language modeling;dense connection", "primary_area": "", "supplementary_material": "", "author": "Yong-Ho Yoo;Kook Han;Sanghyun Cho;Kyoung-Chul Koh;Jong-Hwan Kim", "authorids": "yhyoo@rit.kaist.ac.kr;khan@rit.kaist.ac.kr;scho@rit.kaist.ac.kr;kckoh@rit.kaist.ac.kr;johkim@rit.kaist.ac.kr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyoo2018dense,\ntitle={Dense Recurrent Neural Network with Attention Gate},\nauthor={Yong-Ho Yoo and Kook Han and Sanghyun Cho and Kyoung-Chul Koh and Jong-Hwan Kim},\nyear={2018},\nurl={https://openreview.net/forum?id=rJVruWZRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJVruWZRW", "pdf_size": 0, "rating": "2;4;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nkmgqscnjsIJ:scholar.google.com/&scioq=Dense+Recurrent+Neural+Network+with+Attention+Gate&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Minimal-Entropy Correlation Alignment for Unsupervised Deep Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/223", "id": "rJWechg0Z", "author_site": "Pietro Morerio, Jacopo Cavazza, Vittorio Murino", "tldr": "A new unsupervised deep domain adaptation technique which efficiently unifies correlation alignment and entropy minimization", "abstract": "In this work, we face the problem of unsupervised domain adaptation with a novel deep learning approach which leverages our finding that entropy minimization is induced by the optimal alignment of second order statistics between source and target domains. We formally demonstrate this hypothesis and, aiming at achieving an optimal alignment in practical cases, we adopt a more principled strategy which, differently from the current Euclidean approaches, deploys alignment along geodesics. Our pipeline can be implemented by adding to the standard classification loss (on the labeled source domain), a source-to-target regularizer that is weighted in an unsupervised and data-driven fashion. 
We provide extensive experiments to assess the superiority of our framework on standard domain and modality adaptation benchmarks.", "keywords": "unsupervised domain adaptation;entropy minimization;image classification;deep transfer learning", "primary_area": "", "supplementary_material": "", "author": "Pietro Morerio;Jacopo Cavazza;Vittorio Murino", "authorids": "pietro.morerio@iit.it;jacopo.cavazza@iit.it;vittorio.murino@iit.it", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmorerio2018minimalentropy,\ntitle={Minimal-Entropy Correlation Alignment for Unsupervised Deep Domain Adaptation},\nauthor={Pietro Morerio and Jacopo Cavazza and Vittorio Murino},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJWechg0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;5;4", "rating_avg": 7.0, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 204, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8549844875925427078&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJWechg0Z", "pdf": "https://openreview.net/pdf?id=rJWechg0Z", "email": ";;", "author_num": 3 }, { "id": "rJWrK9lAb", "title": "Autoregressive Generative Adversarial Networks", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) learn a generative model by playing an adversarial game between a generator and an auxiliary discriminator, which classifies data samples vs. generated ones. However, it does not explicitly model feature co-occurrences in samples. In this paper, we propose a novel Autoregressive Generative Adversarial Network (ARGAN), that models the latent distribution of data using an autoregressive model, rather than relying on binary classification of samples into data/generated categories. In this way, feature co-occurrences in samples can be more efficiently captured. Our model was evaluated on two widely used datasets: CIFAR-10 and STL-10. 
Its performance is competitive with respect to other GAN models both quantitatively and qualitatively.", "keywords": "Generative Adversarial Networks;Latent Space Modeling", "primary_area": "", "supplementary_material": "", "author": "Yasin Yazici;Kim-Hui Yap;Stefan Winkler", "authorids": "yasin001@e.ntu.edu.sg;ekhyap@ntu.edu.sg;stefan.winkler@adsc.com.sg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyazici2018autoregressive,\ntitle={Autoregressive Generative Adversarial Networks},\nauthor={Yasin Yazici and Kim-Hui Yap and Stefan Winkler},\nyear={2018},\nurl={https://openreview.net/forum?id=rJWrK9lAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJWrK9lAb", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4766517190463868938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Graph Attention Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/299", "id": "rJXMpikCZ", "author_site": "Petar Veli\u010dkovi\u0107, Guillem Cucurull Preixens, Arantxa Casanova Paga, Adriana Romero, Pietro Li\u00f2, Yoshua Bengio", "tldr": "A novel approach to processing graph-structured data by neural networks, leveraging attention over a node's neighborhood. Achieves state-of-the-art results on transductive citation network tasks and an inductive protein-protein interaction task.", "abstract": "We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods' features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of computationally intensive matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. 
Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).", "keywords": "Deep Learning;Graph Convolutions;Attention;Self-Attention", "primary_area": "", "supplementary_material": "", "author": "Petar Veli\u010dkovi\u0107;Guillem Cucurull;Arantxa Casanova;Adriana Romero;Pietro Li\u00f2;Yoshua Bengio", "authorids": "petar.velickovic@cst.cam.ac.uk;gcucurull@gmail.com;ar.casanova.8@gmail.com;adriana.romsor@gmail.com;pietro.lio@cst.cam.ac.uk;yoshua.umontreal@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nveli\u010dkovi\u01072018graph,\ntitle={Graph Attention Networks},\nauthor={Petar Veli\u010dkovi\u0107 and Guillem Cucurull and Arantxa Casanova and Adriana Romero and Pietro Li\u00f2 and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJXMpikCZ},\n}", "github": "[![github](/images/github_icon.svg) PetarV-/GAT](https://github.com/PetarV-/GAT) + [![Papers with Code](/images/pwc_icon.svg) 89 community implementations](https://paperswithcode.com/paper/?openreview=rJXMpikCZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 22, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 14887, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5609128480281463225&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=rJXMpikCZ", "pdf": "https://openreview.net/pdf?id=rJXMpikCZ", "email": ";;;;;", "author_num": 6 }, { "title": "Simulating Action Dynamics with Neural Process Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/85", "id": "rJYFzMZC-", "author_site": "Antoine Bosselut, Omer Levy, Ariel Holtzman, Corin Ennis, Dieter Fox, Yejin Choi", "tldr": "We propose a new recurrent memory architecture that can track common sense state changes of entities by simulating the causal effects of actions.", "abstract": "Understanding procedural language requires anticipating the causal effects of actions, even when they are not explicitly stated. In this work, we introduce Neural Process Networks to understand procedural text through (neural) simulation of action dynamics. Our model complements existing memory architectures with dynamic entity tracking by explicitly modeling actions as state transformers. The model updates the states of the entities by executing learned action operators. 
Empirical results demonstrate that our proposed model can reason about the unstated causal effects of actions, allowing it to provide more accurate contextual information for understanding and generating procedural text, all while offering more interpretable internal representations than existing alternatives.", "keywords": "representation learning;memory networks;state tracking", "primary_area": "", "supplementary_material": "", "author": "Antoine Bosselut;Omer Levy;Ari Holtzman;Corin Ennis;Dieter Fox;Yejin Choi", "authorids": "antoineb@cs.washington.edu;omerlevy@cs.washington.edu;ahai@cs.washington.edu;corin123@uw.edu;fox@cs.washington.edu;yejin@cs.washington.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nbosselut2018simulating,\ntitle={Simulating Action Dynamics with Neural Process Networks},\nauthor={Antoine Bosselut and Corin Ennis and Omer Levy and Ari Holtzman and Dieter Fox and Yejin Choi},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJYFzMZC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;8;9", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15481167973154467970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJYFzMZC-", "pdf": "https://openreview.net/pdf?id=rJYFzMZC-", "email": ";;;;;", "author_num": 6 }, { "id": "rJa90ceAb", "title": "Learning to Generate Filters for Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "dynamically generate filters conditioned on the input image for CNNs in each forward pass ", "abstract": "Conventionally, convolutional neural networks (CNNs) process different images with the same set of filters. However, the variations in images pose a challenge to this fashion. In this paper, we propose to generate sample-specific filters for convolutional layers in the forward pass. Since the filters are generated on-the-fly, the model becomes more flexible and can better fit the training data compared to traditional CNNs. In order to obtain sample-specific features, we extract the intermediate feature maps from an autoencoder. As filters are usually high dimensional, we propose to learn a set of coefficients instead of a set of filters. These coefficients are used to linearly combine the base filters from a filter repository to generate the final filters for a CNN. The proposed method is evaluated on MNIST, MTFL and CIFAR10 datasets. 
Experimental results demonstrate that the classification accuracy of the baseline model can be improved by using the proposed filter generation method.", "keywords": "filter generation;meta-learning;filter repository;image classification;dynamic generation", "primary_area": "", "supplementary_material": "", "author": "Wei Shen;Rujie Liu", "authorids": "shenwei@cn.fujitsu.com;rjliu@cn.fujitsu.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshen2018learning,\ntitle={Learning to Generate Filters for Convolutional Neural Networks},\nauthor={Wei Shen and Rujie Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=rJa90ceAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJa90ceAb", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13546219281981487701&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rJaE2alRW", "title": "Autoregressive Convolutional Neural Networks for Asynchronous Time Series", "track": "main", "status": "Reject", "tldr": "Convolutional architecture for learning data-dependent weights for autoregressive forecasting of time series.", "abstract": "We propose Significance-Offset Convolutional Neural Network, a deep convolutional network architecture for regression of multivariate asynchronous time series. The model is inspired by standard autoregressive (AR) models and gating mechanisms used in recurrent neural networks. It involves an AR-like weighting system, where the final predictor is obtained as a weighted sum of adjusted regressors, while the weights are data-dependent functions learnt through a convolutional network. The architecture was designed for applications on asynchronous time series and is evaluated on such datasets: a hedge fund proprietary dataset of over 2 million quotes for a credit derivative index, an artificially generated noisy autoregressive series and household electricity consumption dataset. The proposed architecture achieves promising results as compared to convolutional and recurrent neural networks. 
The code for the numerical experiments and the architecture implementation will be shared online to make the research reproducible.", "keywords": "neural networks;convolutional neural networks;time series;asynchronous data;regression", "primary_area": "", "supplementary_material": "", "author": "Mikolaj Binkowski;Gautier Marti;Philippe Donnat", "authorids": "mikbinkowski@gmail.com;gautier.marti@gmail.com;pdonnat@helleborecapital.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbinkowski2018autoregressive,\ntitle={Autoregressive Convolutional Neural Networks for Asynchronous Time Series},\nauthor={Mikolaj Binkowski and Gautier Marti and Philippe Donnat},\nyear={2018},\nurl={https://openreview.net/forum?id=rJaE2alRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJaE2alRW", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16946741031490973459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rJbs5gbRW", "title": "On the Generalization Effects of DenseNet Model Structures", "track": "main", "status": "Reject", "tldr": "Our paper analyses the tremendous representational power of networks especially with 'skip connections', which may be used as a method for better generalization.", "abstract": "Modern neural network architectures take advantage of increasingly deeper layers, and various advances in their structure to achieve better performance. While traditional explicit regularization techniques like dropout, weight decay, and data augmentation are still being used in these new models, little about the regularization and generalization effects of these new structures has been studied. \nBesides being deeper than their predecessors, could newer architectures like ResNet and DenseNet also benefit from their structures' implicit regularization properties? \nIn this work, we investigate the skip connection's effect on a network's generalization features. Through experiments, we show that certain structural choices in neural network architectures contribute to their generalization abilities. Specifically, we study the effect that low-level features have on generalization performance when they are introduced to deeper layers in DenseNet, ResNet as well as networks with 'skip connections'. We show that these low-level representations do help with generalization in multiple settings when both the quality and quantity of training data are decreased. 
", "keywords": "Skip connection;generalization;gegularization;deep network;representation.", "primary_area": "", "supplementary_material": "", "author": "Yin Liu;Vincent Chen", "authorids": "liuyin14@mails.tsinghua.edu.cn;389091983@qq.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2018on,\ntitle={On the Generalization Effects of DenseNet Model Structures },\nauthor={Yin Liu and Vincent Chen},\nyear={2018},\nurl={https://openreview.net/forum?id=rJbs5gbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJbs5gbRW", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1768302310327667966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJe7FW-Cb", "title": "A Painless Attention Mechanism for Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We enhance CNNs with a novel attention mechanism for fine-grained recognition. Superior performance is obtained on 5 datasets.", "abstract": "We propose a novel attention mechanism to enhance Convolutional Neural Networks for fine-grained recognition. The proposed mechanism reuses CNN feature activations to find the most informative parts of the image at different depths with the help of gating mechanisms and without part annotations. Thus, it can be used to augment any layer of a CNN to extract low- and high-level local information to be more discriminative. \n\nDifferently, from other approaches, the mechanism we propose just needs a single pass through the input and it can be trained end-to-end through SGD. As a consequence, the proposed mechanism is modular, architecture-independent, easy to implement, and faster than iterative approaches.\n\nExperiments show that, when augmented with our approach, Wide Residual Networks systematically achieve superior performance on each of five different fine-grained recognition datasets: the Adience age and gender recognition benchmark, Caltech-UCSD Birds-200-2011, Stanford Dogs, Stanford Cars, and UEC Food-100, obtaining competitive and state-of-the-art scores.", "keywords": "computer vision;deep learning;convolutional neural networks;attention", "primary_area": "", "supplementary_material": "", "author": "Pau Rodr\u00edguez;Guillem Cucurull;Jordi Gonz\u00e0lez;Josep M. Gonfaus;Xavier Roca", "authorids": "pau.rodriguez@cvc.uab.es;;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nrodr\u00edguez2018a,\ntitle={A Painless Attention Mechanism for Convolutional Neural Networks},\nauthor={Pau Rodr\u00edguez and Guillem Cucurull and Jordi Gonz\u00e0lez and Josep M. 
Gonfaus and Xavier Roca},\nyear={2018},\nurl={https://openreview.net/forum?id=rJe7FW-Cb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJe7FW-Cb", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17429475735814654262&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2 }, { "id": "rJfHoM-C-", "title": "Few-Shot Learning with Simplex", "track": "main", "status": "Reject", "tldr": "A simplex-based geometric method is proposed to cope with few-shot learning problems.", "abstract": "Deep learning has made remarkable achievement in many fields. However, learning\nthe parameters of neural networks usually demands a large amount of labeled\ndata. The algorithms of deep learning, therefore, encounter difficulties when applied\nto supervised learning where only little data are available. This specific task\nis called few-shot learning. To address it, we propose a novel algorithm for fewshot\nlearning using discrete geometry, in the sense that the samples in a class are\nmodeled as a reduced simplex. The volume of the simplex is used for the measurement\nof class scatter. During testing, combined with the test sample and the\npoints in the class, a new simplex is formed. Then the similarity between the test\nsample and the class can be quantized with the ratio of volumes of the new simplex\nto the original class simplex. Moreover, we present an approach to constructing\nsimplices using local regions of feature maps yielded by convolutional neural networks.\nExperiments on Omniglot and miniImageNet verify the effectiveness of\nour simplex algorithm on few-shot learning.", "keywords": "One-shot learning;few-shot learning;deep learning;simplex", "primary_area": "", "supplementary_material": "", "author": "Bowen Zhang;Xifan Zhang;Fan Cheng;Deli Zhao", "authorids": "bwzhang@sjtu.edu.cn;xf-zh14@mails.tsinghua.edu.cn;chengfan85@gmail.com;zhaodeli@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2018fewshot,\ntitle={Few-Shot Learning with Simplex},\nauthor={Bowen Zhang and Xifan Zhang and Fan Cheng and Deli Zhao},\nyear={2018},\nurl={https://openreview.net/forum?id=rJfHoM-C-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJfHoM-C-", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9L2wkIfrheoJ:scholar.google.com/&scioq=Few-Shot+Learning+with+Simplex&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "id": "rJg4YGWRb", "title": "Attention-based Graph Neural Network for Semi-supervised Learning", "track": "main", "status": "Reject", "tldr": "We propose a novel attention-based interpretable Graph Neural Network architecture which outperforms the current state-of-the-art Graph Neural Networks in standard benchmark datasets", "abstract": "Recently popularized graph neural networks achieve the state-of-the-art accuracy on a number of standard benchmark 
datasets for graph-based semi-supervised learning, improving significantly over existing approaches. These architectures alternate between a propagation layer that aggregates the hidden states of the local neighborhood and a fully-connected layer. Perhaps surprisingly, we show that a linear model, which removes all the intermediate fully-connected layers, is still able to achieve a performance comparable to the state-of-the-art models. This significantly reduces the number of parameters, which is critical for semi-supervised learning where the number of labeled examples is small. This in turn allows room for designing more innovative propagation layers. Based on this insight, we propose a novel graph neural network that removes all the intermediate fully-connected layers, and replaces the propagation layers with attention mechanisms that respect the structure of the graph. The attention mechanism allows us to learn a dynamic and adaptive local summary of the neighborhood to achieve more accurate predictions. In a number of experiments on benchmark citation network datasets, we demonstrate that our approach outperforms competing methods. By examining the attention weights among neighbors, we show that our model provides some interesting insights on how neighbors influence each other.", "keywords": "Graph Neural Network;Attention;Semi-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Kiran K. Thekumparampil;Sewoong Oh;Chong Wang;Li-Jia Li", "authorids": "kirankoshy@gmail.com;sewoong79@gmail.com;chongw@google.com;lijiali@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nk.2018attentionbased,\ntitle={Attention-based Graph Neural Network for Semi-supervised Learning},\nauthor={Kiran K. Thekumparampil and Sewoong Oh and Chong Wang and Li-Jia Li},\nyear={2018},\nurl={https://openreview.net/forum?id=rJg4YGWRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJg4YGWRb", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 480, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11530753019909090241&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "rJhR_pxCZ", "title": "Interpretable Classification via Supervised Variational Autoencoders and Differentiable Decision Trees", "track": "main", "status": "Reject", "tldr": "We combine differentiable decision trees with supervised variational autoencoders to enhance interpretability of classification. ", "abstract": "As deep learning-based classifiers are increasingly adopted in real-world applications, the importance of understanding how a particular label is chosen grows. Single decision trees are an example of a simple, interpretable classifier, but are unsuitable for use with complex, high-dimensional data. On the other hand, the variational autoencoder (VAE) is designed to learn a factored, low-dimensional representation of data, but typically encodes high-likelihood data in an intrinsically non-separable way. 
We introduce the differentiable decision tree (DDT) as a modular component of deep networks and a simple, differentiable loss function that allows for end-to-end optimization of a deep network to compress high-dimensional data for classification by a single decision tree. We also explore the power of labeled data in a supervised VAE (SVAE) with a Gaussian mixture prior, which leverages label information to produce a high-quality generative model with improved bounds on log-likelihood. We combine the SVAE with the DDT to get our classifier+VAE (C+VAE), which is competitive in both classification error and log-likelihood, despite optimizing both simultaneously and using a very simple encoder/decoder architecture. ", "keywords": "interpretable classification;decision trees;deep learning;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Eleanor Quint;Garrett Wirka;Jacob Williams;Stephen Scott;N.V. Vinodchandran", "authorids": "pquint@cse.unl.edu;gwirka@cse.unl.edu;jwilliam@cse.unl.edu;sscott@cse.unl.edu;vinod@cse.unl.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nquint2018interpretable,\ntitle={Interpretable Classification via Supervised Variational Autoencoders and Differentiable Decision Trees},\nauthor={Eleanor Quint and Garrett Wirka and Jacob Williams and Stephen Scott and N.V. Vinodchandran},\nyear={2018},\nurl={https://openreview.net/forum?id=rJhR_pxCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJhR_pxCZ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9212168688455252557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJiaRbk0-", "title": "Towards Binary-Valued Gates for Robust LSTM Training", "track": "main", "status": "Reject", "tldr": "We propose a new algorithm for LSTM training by learning towards binary-valued gates which we shown has many nice properties.", "abstract": "Long Short-Term Memory (LSTM) is one of the most widely used recurrent structures in sequence modeling. Its goal is to use gates to control the information flow (e.g., whether to skip some information/transformation or not) in the recurrent computations, although its practical implementation based on soft gates only partially achieves this goal and is easy to overfit. In this paper, we propose a new way for LSTM training, which pushes the values of the gates towards 0 or 1. By doing so, we can (1) better control the information flow: the gates are mostly open or closed, instead of in a middle state; and (2) avoid overfitting to certain extent: the gates operate at their flat regions, which is shown to correspond to better generalization ability. However, learning towards discrete values of the gates is generally difficult. To tackle this challenge, we leverage the recently developed Gumbel-Softmax trick from the field of variational methods, and make the model trainable with standard backpropagation. 
Experimental results on language modeling and machine translation show that (1) the values of the gates generated by our method are more reasonable and intuitively interpretable, and (2) our proposed method generalizes better and achieves better accuracy on test sets in all tasks. Moreover, the learnt models are not sensitive to low-precision approximation and low-rank approximation of the gate parameters due to the flat loss surface.", "keywords": "recurrent neural network;LSTM;long-short term memory network;machine translation;generalization", "primary_area": "", "supplementary_material": "", "author": "Zhuohan Li;Di He;Fei Tian;Wei Chen;Tao Qin;Liwei Wang;Tie-Yan Liu", "authorids": "lizhuohan@pku.edu.cn;di_he@pku.edu.cn;fetia@microsoft.com;wche@microsoft.com;taoqin@microsoft.com;wanglw@cis.pku.edu.cn;tyliu@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nhe2018towards,\ntitle={Towards Binary-Valued Gates for Robust {LSTM} Training },\nauthor={Di He and Zhuohan Li and Fei Tian and Wei Chen and Tao Qin and Liwei Wang and Tie-Yan Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=rJiaRbk0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJiaRbk0-", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 7, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9655995199891931380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rJk51gJRb", "title": "Adversarial Policy Gradient for Alternating Markov Games", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Policy gradient reinforcement learning has been applied to two-player alternate-turn zero-sum games, e.g., in AlphaGo, self-play REINFORCE was used to improve the neural net model after supervised learning. In this paper, we emphasize that two-player zero-sum games with alternating turns, which have been previously formulated as Alternating Markov Games (AMGs), are different from standard MDP because of their two-agent nature. We exploit the difference in associated Bellman equations, which leads to different policy iteration algorithms. As policy gradient method is a kind of generalized policy iteration, we show how these differences in policy iteration are reflected in policy gradient for AMGs. We formulate an adversarial policy gradient and discuss potential possibilities for developing better policy gradient methods other than self-play REINFORCE. The core idea is to estimate the minimum rather than the mean for the \u201ccritic\u201d. Experimental results on the game of Hex show the modified Monte Carlo policy gradient methods are able to learn better pure neural net policies than the REINFORCE variants. To apply learned neural weights to multiple board sizes Hex, we describe a board-size independent neural net architecture. We show that when combined with search, using a single neural net model, the resulting program consistently beats MoHex 2.0, the state-of-the-art computer Hex player, on board sizes from 9\u00d79 to 13\u00d713. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chao Gao;Martin Mueller;Ryan Hayward", "authorids": "cgao3@ualberta.ca;mmueller@ualberta.ca;hayward@ualberta.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngao2018adversarial,\ntitle={Adversarial Policy Gradient for Alternating Markov Games},\nauthor={Chao Gao and Martin Mueller and Ryan Hayward},\nyear={2018},\nurl={https://openreview.net/forum?id=rJk51gJRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJk51gJRb", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;2", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4441054404755805801&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Evidence Aggregation for Answer Re-Ranking in Open-Domain Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/102", "id": "rJl3yM-Ab", "author_site": "Shuohang Wang, Mo Yu, Jing Jiang, Wei Zhang, Xiaoxiao Guo, Shiyu Chang, Zhiguo Wang, Tim Klinger, Gerald Tesauro, Murray Campbell", "tldr": "We propose a method that can make use of the multiple passages information for open-domain QA.", "abstract": "Very recently, it comes to be a popular approach for answering open-domain questions by first searching question-related passages, then applying reading comprehension models to extract answers. Existing works usually extract answers from single passages independently, thus not fully make use of the multiple searched passages, especially for the some questions requiring several evidences, which can appear in different passages, to be answered. The above observations raise the problem of evidence aggregation from multiple passages. In this paper, we deal with this problem as answer re-ranking. Specifically, based on the answer candidates generated from the existing state-of-the-art QA model, we propose two different re-ranking methods, strength-based and coverage-based re-rankers, which make use of the aggregated evidences from different passages to help entail the ground-truth answer for the question. Our model achieved state-of-the-arts on three public open-domain QA datasets, Quasar-T, SearchQA and the open-domain version of TriviaQA, with about 8\\% improvement on the former two datasets. 
", "keywords": "Question Answering;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Shuohang Wang;Mo Yu;Jing Jiang;Wei Zhang;Xiaoxiao Guo;Shiyu Chang;Zhiguo Wang;Tim Klinger;Gerald Tesauro;Murray Campbell", "authorids": "shwang.2014@phdis.smu.edu.sg;yum@us.ibm.com;jingjiang@smu.edu.sg;zhangwei@us.ibm.com;xiaoxiao.guo@ibm.com;shiyu.chang@ibm.com;zhigwang@us.ibm.com;tklinger@us.ibm.com;gtesauro@us.ibm.com;mcam@us.ibm.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nwang2018evidence,\ntitle={Evidence Aggregation for Answer Re-Ranking in Open-Domain Question Answering},\nauthor={Shuohang Wang and Mo Yu and Jing Jiang and Wei Zhang and Xiaoxiao Guo and Shiyu Chang and Zhiguo Wang and Tim Klinger and Gerald Tesauro and Murray Campbell},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJl3yM-Ab},\n}", "github": "[![github](/images/github_icon.svg) shuohangwang/mprc](https://github.com/shuohangwang/mprc)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;2;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 207, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5917321946590508860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rJl3yM-Ab", "pdf": "https://openreview.net/pdf?id=rJl3yM-Ab", "email": ";;;;;;;;;", "author_num": 10 }, { "title": "Parametrized Hierarchical Procedures for Neural Programming", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/30", "id": "rJl63fZRb", "author_site": "Roy Fox, Richard Shin, Sanjay Krishnan, Ken Goldberg, Dawn Song, Ion Stoica", "tldr": "We introduce the PHP model for hierarchical representation of neural programs, and an algorithm for learning PHPs from a mixture of strong and weak supervision.", "abstract": "Neural programs are highly accurate and structured policies that perform algorithmic tasks by controlling the behavior of a computation mechanism. Despite the potential to increase the interpretability and the compositionality of the behavior of artificial agents, it remains difficult to learn from demonstrations neural networks that represent computer programs. The main challenges that set algorithmic domains apart from other imitation learning domains are the need for high accuracy, the involvement of specific structures of data, and the extremely limited observability. To address these challenges, we propose to model programs as Parametrized Hierarchical Procedures (PHPs). A PHP is a sequence of conditional operations, using a program counter along with the observation to select between taking an elementary action, invoking another PHP as a sub-procedure, and returning to the caller. We develop an algorithm for training PHPs from a set of supervisor demonstrations, only some of which are annotated with the internal call structure, and apply it to efficient level-wise training of multi-level PHPs. 
We show in two benchmarks, NanoCraft and long-hand addition, that PHPs can learn neural programs more accurately from smaller amounts of both annotated and unannotated demonstrations.", "keywords": "Neural programming;Hierarchical Control", "primary_area": "", "supplementary_material": "", "author": "Roy Fox;Richard Shin;Sanjay Krishnan;Ken Goldberg;Dawn Song;Ion Stoica", "authorids": "roy.d.fox@gmail.com;shin.richard@gmail.com;sanjay@eecs.berkeley.edu;goldberg@berkeley.edu;dawnsong.travel@gmail.com;istoica@cs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nfox2018parametrized,\ntitle={Parametrized Hierarchical Procedures for Neural Programming},\nauthor={Roy Fox and Richard Shin and Sanjay Krishnan and Ken Goldberg and Dawn Song and Ion Stoica},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJl63fZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "1;2;3", "rating_avg": 6.0, "confidence_avg": 2.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11070704689577142553&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rJl63fZRb", "pdf": "https://openreview.net/pdf?id=rJl63fZRb", "email": ";;;;;", "author_num": 6 }, { "title": "Improving the Universality and Learnability of Neural Programmer-Interpreters with Combinator Abstraction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/197", "id": "rJlMAAeC-", "author_site": "Da Xiao, Jo-Yu Liao, Xingyuan Yuan", "tldr": "", "abstract": "To overcome the limitations of Neural Programmer-Interpreters (NPI) in its universality and learnability, we propose the incorporation of combinator abstraction into neural programming and a new NPI architecture to support this abstraction, which we call Combinatory Neural Programmer-Interpreter (CNPI). Combinator abstraction dramatically reduces the number and complexity of programs that need to be interpreted by the core controller of CNPI, while still allowing the CNPI to represent and interpret arbitrarily complex programs by the collaboration of the core with the other components. We propose a small set of four combinators to capture the most pervasive programming patterns. Due to the finiteness and simplicity of this combinator set and the offloading of some burden of interpretation from the core, we are able to construct a CNPI that is universal with respect to the set of all combinatorizable programs, which is adequate for solving most algorithmic tasks. 
Moreover, besides supervised training on execution traces, CNPI can be trained by policy gradient reinforcement learning with appropriately designed curricula.", "keywords": "neural programming;Neural Programmer-Interpreter", "primary_area": "", "supplementary_material": "", "author": "Da Xiao;Jo-Yu Liao;Xingyuan Yuan", "authorids": "xiaoda99@gmail.com;liaoruoyu@caiyunapp.com;yuan@caiyunapp.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nxiao2018improving,\ntitle={Improving the Universality and Learnability of Neural Programmer-Interpreters with Combinator Abstraction},\nauthor={Da Xiao and Jo-Yu Liao and Xingyuan Yuan},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJlMAAeC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15858127399451463236&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rJlMAAeC-", "pdf": "https://openreview.net/pdf?id=rJlMAAeC-", "email": ";;", "author_num": 3 }, { "title": "Learning Parametric Closed-Loop Policies for Markov Potential Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/74", "id": "rJm7VfZA-", "author_site": "Sergio Valcarcel Macua, Javier Zazo, Santiago Zazo", "tldr": "We present general closed loop analysis for Markov potential games and show that deep reinforcement learning can be used for learning approximate closed-loop Nash equilibrium.", "abstract": "Multiagent systems where the agents interact among themselves and with an stochastic environment can be formalized as stochastic games. We study a subclass of these games, named Markov potential games (MPGs), that appear often in economic and engineering applications when the agents share some common resource. We consider MPGs with continuous state-action variables, coupled constraints and nonconvex rewards. Previous analysis followed a variational approach that is only valid for very simple cases (convex rewards, invertible dynamics, and no coupled constraints); or considered deterministic dynamics and provided open-loop (OL) analysis, studying strategies that consist in predefined action sequences, which are not optimal for stochastic environments. We present a closed-loop (CL) analysis for MPGs and consider parametric policies that depend on the current state and where agents adapt to stochastic transitions. We provide easily verifiable, sufficient and necessary conditions for a stochastic game to be an MPG, even for complex parametric functions (e.g., deep neural networks); and show that a closed-loop Nash equilibrium (NE) can be found (or at least approximated) by solving a related optimal control problem (OCP). This is useful since solving an OCP---which is a single-objective problem---is usually much simpler than solving the original set of coupled OCPs that form the game---which is a multiobjective control problem. 
This is a considerable improvement over the previously standard approach for the CL analysis of MPGs, which gives no approximate solution if no NE belongs to the chosen parametric family, and which is practical only for simple parametric forms. We illustrate the theoretical contributions with an example by applying our approach to a noncooperative communications engineering game. We then solve the game with a deep reinforcement learning algorithm that learns policies that closely approximates an exact variational NE of the game.", "keywords": "Stochastic games;potential games;closed loop;reinforcement learning;multiagent systems", "primary_area": "", "supplementary_material": "", "author": "Sergio Valcarcel Macua;Javier Zazo;Santiago Zazo", "authorids": "sergio@prowler.io;javier.zazo.ruiz@upm.es;santiago@gaps.ssr.upm.es", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nvalcarcel2018learning,\ntitle={Learning Parametric Closed-Loop Policies for Markov Potential Games},\nauthor={Sergio Valcarcel Macua and Javier Zazo and Santiago Zazo},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJm7VfZA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;1;2", "rating_avg": 6.333333333333333, "confidence_avg": 2.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6885879364058847450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rJm7VfZA-", "pdf": "https://openreview.net/pdf?id=rJm7VfZA-", "email": ";;", "author_num": 3 }, { "id": "rJma2bZCW", "title": "Three factors influencing minima in SGD", "track": "main", "status": "Reject", "tldr": "Three factors (batch size, learning rate, gradient noise) change in predictable way the properties (e.g. sharpness) of minima found by SGD.", "abstract": "We study the statistical properties of the endpoint of stochastic gradient descent (SGD). We approximate SGD as a stochastic differential equation (SDE) and consider its Boltzmann Gibbs equilibrium distribution under the assumption of isotropic variance in loss gradients.. Through this analysis, we find that three factors \u2013 learning rate, batch size and the variance of the loss gradients \u2013 control the trade-off between the depth and width of the minima found by SGD, with wider minima favoured by a higher ratio of learning rate to batch size. In the equilibrium distribution only the ratio of learning rate to batch size appears, implying that it\u2019s invariant under a simultaneous rescaling of each by the same amount. \nWe experimentally show how learning rate and batch size affect SGD from two perspectives: the endpoint of SGD and the dynamics that lead up to it. For the endpoint, the experiments suggest the endpoint of SGD is similar under simultaneous rescaling of batch size and learning rate, and also that a higher ratio leads to flatter minima, both findings are consistent with our theoretical analysis. 
We note experimentally that the dynamics also seem to be similar under the same rescaling of learning rate and batch size, which we explore showing that one can exchange batch size and learning rate in a cyclical learning rate schedule. Next, we illustrate how noise affects memorization, showing that high noise levels lead to better generalization. Finally, we find experimentally that the similarity under simultaneous rescaling of learning rate and batch size breaks down if the learning rate gets too large or the batch size gets too small.", "keywords": "SGD;Deep Learning;Generalization", "primary_area": "", "supplementary_material": "", "author": "Stanis\u0142aw Jastrz\u0119bski;Zac Kenton;Devansh Arpit;Nicolas Ballas;Asja Fischer;Amos Storkey;Yoshua Bengio", "authorids": "staszek.jastrzebski@gmail.com;zakenton@gmail.com;devansh.arpit@umontreal.ca;ballas.n@gmail.com;asja.fischer@gmail.com;a.storkey@ed.ac.uk;yoshua.umontreal@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\njastrz\u0119bski2018three,\ntitle={Three factors influencing minima in {SGD}},\nauthor={Stanis\u0142aw Jastrz\u0119bski and Zac Kenton and Devansh Arpit and Nicolas Ballas and Asja Fischer and Amos Storkey and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=rJma2bZCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJma2bZCW", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 557, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5579830731629893992&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rJoXrxZAZ", "title": "HybridNet: A Hybrid Neural Architecture to Speed-up Autoregressive Models", "track": "main", "status": "Reject", "tldr": "It is a hybrid neural architecture to speed-up autoregressive model. ", "abstract": "This paper introduces HybridNet, a hybrid neural network to speed-up autoregressive\nmodels for raw audio waveform generation. As an example, we propose\na hybrid model that combines an autoregressive network named WaveNet and a\nconventional LSTM model to address speech synthesis. Instead of generating\none sample per time-step, the proposed HybridNet generates multiple samples per\ntime-step by exploiting the long-term memory utilization property of LSTMs. 
In\nthe evaluation, when applied to text-to-speech, HybridNet yields state-of-the-art performance.\nHybridNet achieves a 3.83 subjective 5-scale mean opinion score on\nUS English, largely outperforming the same size WaveNet in terms of naturalness\nand providing a 2x speedup at inference.", "keywords": "neural architecture;inference time reduction;hybrid model", "primary_area": "", "supplementary_material": "", "author": "Yanqi Zhou;Wei Ping;Sercan Arik;Kainan Peng;Greg Diamos", "authorids": "zhouyanqi@baidu.com;pingwei01@baidu.com;sercanarik@baidu.com;pengkainan@baidu.com;gregdiamos@baidu.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhou2018hybridnet,\ntitle={HybridNet: A Hybrid Neural Architecture to Speed-up Autoregressive Models},\nauthor={Yanqi Zhou and Wei Ping and Sercan Arik and Kainan Peng and Greg Diamos},\nyear={2018},\nurl={https://openreview.net/forum?id=rJoXrxZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJoXrxZAZ", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;5;5", "rating_avg": 4.666666666666667, "confidence_avg": 5.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5725656242527015693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJqfKPJ0Z", "title": "Clipping Free Attacks Against Neural Networks", "track": "main", "status": "Reject", "tldr": " In this paper, a new method we call Centered Initial Attack (CIA) is provided. It ensures by construction the maximum perturbation to be smaller than a threshold fixed beforehand, without the clipping process.", "abstract": "During the last years, a remarkable breakthrough has been made in the AI domain\nthanks to artificial deep neural networks that achieved great success in many\nmachine learning tasks in computer vision, natural language processing, speech\nrecognition, malware detection and so on. However, they are highly vulnerable\nto easily crafted adversarial examples. Many investigations have pointed out this\nfact and different approaches have been proposed to generate attacks while adding\na limited perturbation to the original data. The most robust known method so far\nis the so-called C&W attack [1]. Nonetheless, a countermeasure known as feature\nsqueezing coupled with ensemble defense showed that most of these attacks\ncan be destroyed [6]. In this paper, we present a new method we call Centered\nInitial Attack (CIA) whose advantage is twofold: first, it ensures by construction\nthe maximum perturbation to be smaller than a threshold fixed beforehand,\nwithout the clipping process that degrades the quality of attacks. Second, it is\nrobust against recently introduced defenses such as feature squeezing, JPEG encoding\nand even against a voting ensemble of defenses. While its application is\nnot limited to images, we illustrate this using five of the current best classifiers\non the ImageNet dataset among which two are adversarially retrained on purpose to\nbe robust against attacks. With a fixed maximum perturbation of only 1.5% on\nany pixel, around 80% of attacks (targeted) fool the voting ensemble defense and\nnearly 100% when the perturbation is only 6%. 
While this shows how it is difficult\nto defend against CIA attacks, the last section of the paper gives some guidelines\nto limit their impact.", "keywords": "Adversarial examples;Neural Networks;Clipping", "primary_area": "", "supplementary_material": "", "author": "Boussad ADDAD;Boussad ADDAD", "authorids": "boussad.addad@thalesgroup.com;boussad83@yahoo.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\naddad2018clipping,\ntitle={Clipping Free Attacks Against Neural Networks},\nauthor={Boussad ADDAD},\nyear={2018},\nurl={https://openreview.net/forum?id=rJqfKPJ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJqfKPJ0Z", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;3;2", "rating_avg": 4.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R6zMvswdV3cJ:scholar.google.com/&scioq=Clipping+Free+Attacks+Against+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJr4kfWCb", "title": "Lung Tumor Location and Identification with AlexNet and a Custom CNN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Lung cancer is the leading cause of cancer deaths in the world and early detection is a crucial part of increasing patient survival. Deep learning techniques provide us with a method of automated analysis of patient scans. In this work, we compare AlexNet, a multi-layered and highly \ufb02exible architecture, with a custom CNN to determine if lung nodules with patient scans are benign or cancerous. We have found our CNN architecture to be highly accurate (99.79%) and fast while maintaining low False Positive and False Negative rates (< 0.01% and 0.15% respectively). This is important as high false positive rates are a serious issue with lung cancer diagnosis. 
We have found that AlexNet is not well suited to the problem of nodule identification, though it is a good baseline comparison because of its flexibility.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Allison M Rossetto;Wenjin Zhou", "authorids": "allison_rossetto@student.uml.edu;wenjin_zhou@uml.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nm2018lung,\ntitle={Lung Tumor Location and Identification with AlexNet and a Custom {CNN}},\nauthor={Allison M Rossetto and Wenjin Zhou},\nyear={2018},\nurl={https://openreview.net/forum?id=rJr4kfWCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJr4kfWCb", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5p0aLkXKlgsJ:scholar.google.com/&scioq=Lung+Tumor+Location+and+Identification+with+AlexNet+and+a+Custom+CNN&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJrTwxbCb", "title": "Empirical Analysis of the Hessian of Over-Parametrized Neural Networks", "track": "main", "status": "Workshop", "tldr": "The loss surface is *very* degenerate, and there are no barriers between large batch and small batch solutions.", "abstract": "We study the properties of common loss surfaces through their Hessian matrix. In particular, in the context of deep learning, we empirically show that the spectrum of the Hessian is composed of two parts: (1) the bulk, centered near zero, and (2) outliers away from the bulk. We present numerical evidence and mathematical justifications for the following conjectures laid out by Sagun et al. (2016): Fixing data, increasing the number of parameters merely scales the bulk of the spectrum; fixing the dimension and changing the data (for instance adding more clusters or making the data less separable) only affects the outliers. We believe that our observations have striking implications for non-convex optimization in high dimensions. First, the *flatness* of such landscapes (which can be measured by the singularity of the Hessian) implies that classical notions of basins of attraction may be quite misleading, and that the discussion of wide/narrow basins may be in need of a new perspective around over-parametrization and redundancy that are able to create *large* connected components at the bottom of the landscape. Second, the dependence of a small number of large eigenvalues on the data distribution can be linked to the spectrum of the covariance matrix of gradients of model outputs. With this in mind, we may reevaluate the connections within the data-architecture-algorithm framework of a model, hoping that it would shed light on the geometry of high-dimensional and non-convex spaces in modern applications. 
In particular, we present a case that links the two observations: small and large batch gradient descent appear to converge to different basins of attraction but we show that they are in fact connected through their flat region and so belong to the same basin.", "keywords": "Deep Learning;Over-parametrization;Hessian;Eigenvalues;Flat minima;Large batch Small batch", "primary_area": "", "supplementary_material": "", "author": "Levent Sagun;Utku Evci;V. Ugur Guney;Yann Dauphin;Leon Bottou", "authorids": "leventsagun@gmail.com;ue225@nyu.edu;vug@fb.com;yann@dauphin.io;leonb@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsagun2018empirical,\ntitle={Empirical Analysis of the Hessian of Over-Parametrized Neural Networks},\nauthor={Levent Sagun and Utku Evci and V. Ugur Guney and Yann Dauphin and Leon Bottou},\nyear={2018},\nurl={https://openreview.net/forum?id=rJrTwxbCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJrTwxbCb", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 435, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17796454990684335723&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "rJssAZ-0-", "title": "TRL: Discriminative Hints for Scalable Reverse Curriculum Learning", "track": "main", "status": "Reject", "tldr": "We propose Tendency RL to efficiently solve goal-oriented tasks with large state space using automated curriculum learning and discriminative shaping reward, which has the potential to tackle robot manipulation tasks with perception.", "abstract": "Deep reinforcement learning algorithms have proven successful in a variety of domains. However, tasks with sparse rewards remain challenging when the state space is large. Goal-oriented tasks are among the most typical problems in this domain, where a reward can only be received when the final goal is accomplished. In this work, we propose a potential solution to such problems with the introduction of an experience-based tendency reward mechanism, which provides the agent with additional hints based on a discriminative learning on past experiences during an automated reverse curriculum. This mechanism not only provides dense additional learning signals on what states lead to success, but also allows the agent to retain only this tendency reward instead of the whole histories of experience during multi-phase curriculum learning. We extensively study the advantages of our method on the standard sparse reward domains like Maze and Super Mario Bros and show that our method performs more efficiently and robustly than prior approaches in tasks with long time horizons and large state space. 
In addition, we demonstrate that using an optional keyframe scheme with very small quantity of key states, our approach can solve difficult robot manipulation challenges directly from perception and sparse rewards.", "keywords": "deep learning;deep reinforcement learning;robotics;perception", "primary_area": "", "supplementary_material": "", "author": "Chen Wang;Xiangyu Chen;Zelin Ye;Jialu Wang;Ziruo Cai;Shixiang Gu;Cewu Lu", "authorids": "jere.wang@sjtu.edu.cn;cxy_1997@sjtu.edu.cn;h_e_r_o@sjtu.edu.cn;faldict@sjtu.edu.cn;sjtu_caiziruo@sjtu.edu.cn;sg717@cam.ac.uk;lucewu@sjtu.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nwang2018trl,\ntitle={{TRL}: Discriminative Hints for Scalable Reverse Curriculum Learning},\nauthor={Chen Wang and Xiangyu Chen and Zelin Ye and Jialu Wang and Ziruo Cai and Shixiang Gu and Cewu Lu},\nyear={2018},\nurl={https://openreview.net/forum?id=rJssAZ-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJssAZ-0-", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mNXpFxSr0fAJ:scholar.google.com/&scioq=TRL:+Discriminative+Hints+for+Scalable+Reverse+Curriculum+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJv4XWZA-", "title": "Generating Differentially Private Datasets Using GANs", "track": "main", "status": "Reject", "tldr": "Train GANs with differential privacy to generate artificial privacy-preserving datasets.", "abstract": "In this paper, we present a technique for generating artificial datasets that retain statistical properties of the real data while providing differential privacy guarantees with respect to this data. We include a Gaussian noise layer in the discriminator of a generative adversarial network to make the output and the gradients differentially private with respect to the training data, and then use the generator component to synthesise privacy-preserving artificial dataset. 
Our experiments show that under a reasonably small privacy budget we are able to generate data of high quality and successfully train machine learning models on this artificial data.", "keywords": "generative adversarial networks;differential privacy;synthetic data", "primary_area": "", "supplementary_material": "", "author": "Aleksei Triastcyn;Boi Faltings", "authorids": "aleksei.triastcyn@epfl.ch;boi.faltings@epfl.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntriastcyn2018generating,\ntitle={Generating Differentially Private Datasets Using {GAN}s},\nauthor={Aleksei Triastcyn and Boi Faltings},\nyear={2018},\nurl={https://openreview.net/forum?id=rJv4XWZA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJv4XWZA-", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10085965372632022122&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "An efficient framework for learning sentence representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/129", "id": "rJvJXZb0W", "author_site": "Lajanugen Logeswaran, Honglak Lee", "tldr": "A framework for learning high-quality sentence representations efficiently.", "abstract": "In this work we propose a simple and efficient framework for learning sentence representations from unlabelled data. Drawing inspiration from the distributional hypothesis and recent work on learning sentence representations, we reformulate the problem of predicting the context in which a sentence appears as a classification problem. Given a sentence and the context in which it appears, a classifier distinguishes context sentences from other contrastive sentences based on their vector representations. This allows us to efficiently learn different types of encoding functions, and we show that the model learns high-quality sentence representations. 
We demonstrate that our sentence representations outperform state-of-the-art unsupervised and supervised representation learning methods on several downstream NLP tasks that involve understanding sentence semantics while achieving an order of magnitude speedup in training time.", "keywords": "sentence;embeddings;unsupervised;representations;learning;efficient", "primary_area": "", "supplementary_material": "", "author": "Lajanugen Logeswaran;Honglak Lee", "authorids": "llajan@umich.edu;honglak@eecs.umich.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlogeswaran2018an,\ntitle={An efficient framework for learning sentence representations},\nauthor={Lajanugen Logeswaran and Honglak Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJvJXZb0W},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=rJvJXZb0W)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 732, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12944665931993057404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJvJXZb0W", "pdf": "https://openreview.net/pdf?id=rJvJXZb0W", "email": ";", "author_num": 2 }, { "title": "Divide-and-Conquer Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/100", "id": "rJwelMbR-", "author_site": "Dibya Ghosh, Avi Singh, Aravind Rajeswaran, Vikash Kumar, Sergey Levine", "tldr": "", "abstract": "Standard model-free deep reinforcement learning (RL) algorithms sample a new initial state for each trial, allowing them to optimize policies that can perform well even in highly stochastic environments. However, problems that exhibit considerable initial state variation typically produce high-variance gradient estimates for model-free RL, making direct policy or value function optimization challenging. In this paper, we develop a novel algorithm that instead partitions the initial state space into \"slices\", and optimizes an ensemble of policies, each on a different slice. The ensemble is gradually unified into a single policy that can succeed on the whole state space. This approach, which we term divide-and-conquer RL, is able to solve complex tasks where conventional deep RL methods are ineffective. Our results show that divide-and-conquer RL greatly outperforms conventional policy gradient methods on challenging grasping, manipulation, and locomotion tasks, and exceeds the performance of a variety of prior methods. 
Videos of policies learned by our algorithm can be viewed at https://sites.google.com/view/dnc-rl/\n", "keywords": "deep reinforcement learning;reinforcement learning;policy gradients;model-free", "primary_area": "", "supplementary_material": "", "author": "Dibya Ghosh;Avi Singh;Aravind Rajeswaran;Vikash Kumar;Sergey Levine", "authorids": "dibya.ghosh@berkeley.edu;avisingh@cs.berkeley.edu;aravraj@cs.washington.edu;vikash@cs.washington.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nghosh2018divideandconquer,\ntitle={Divide-and-Conquer Reinforcement Learning},\nauthor={Dibya Ghosh and Avi Singh and Aravind Rajeswaran and Vikash Kumar and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJwelMbR-},\n}", "github": "[![github](/images/github_icon.svg) dibyaghosh/dnc](https://github.com/dibyaghosh/dnc)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 136, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8527540948926777430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJwelMbR-", "pdf": "https://openreview.net/pdf?id=rJwelMbR-", "email": ";;;;", "author_num": 5 }, { "title": "Towards Deep Learning Models Resistant to Adversarial Attacks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/67", "id": "rJzIBfZAb", "author_site": "Aleksander Madry, Aleksandar A Makelov, Ludwig Schmidt, Dimitris Tsipras, Adrian Vladu", "tldr": "We provide a principled, optimization-based re-look at the notion of adversarial examples, and develop methods that produce models that are adversarially robust against a wide range of adversaries.", "abstract": "Recent work has demonstrated that neural networks are vulnerable to adversarial examples, i.e., inputs that are almost indistinguishable from natural data and yet classified incorrectly by the network. To address this problem, we study the adversarial robustness of neural networks through the lens of robust optimization. This approach provides us with a broad and unifying view on much prior work on this topic. Its principled nature also enables us to identify methods for both training and attacking neural networks that are reliable and, in a certain sense, universal. In particular, they specify a concrete security guarantee that would protect against a well-defined class of adversaries. These methods let us train networks with significantly improved resistance to a wide range of adversarial attacks. They also suggest robustness against a first-order adversary as a natural security guarantee. 
We believe that robustness against such well-defined classes of adversaries is an important stepping stone towards fully resistant deep learning models.", "keywords": "adversarial examples;robust optimization;ML security", "primary_area": "", "supplementary_material": "", "author": "Aleksander Madry;Aleksandar Makelov;Ludwig Schmidt;Dimitris Tsipras;Adrian Vladu", "authorids": "madry@mit.edu;amakelov@mit.edu;ludwigs@mit.edu;tsipras@mit.edu;avladu@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmadry2018towards,\ntitle={Towards Deep Learning Models Resistant to Adversarial Attacks},\nauthor={Aleksander Madry and Aleksandar Makelov and Ludwig Schmidt and Dimitris Tsipras and Adrian Vladu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rJzIBfZAb},\n}", "github": "[![github](/images/github_icon.svg) MadryLab/mnist_challenge](https://github.com/MadryLab/mnist_challenge) + [![Papers with Code](/images/pwc_icon.svg) 56 community implementations](https://paperswithcode.com/paper/?openreview=rJzIBfZAb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 15225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14165082781627851489&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rJzIBfZAb", "pdf": "https://openreview.net/pdf?id=rJzIBfZAb", "email": ";;;;", "author_num": 5 }, { "title": "Learning an Embedding Space for Transferable Robot Skills", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/6", "id": "rk07ZXZRb", "author_site": "Karol Hausman, Jost Tobias Springenberg, ziyu wang, Nicolas Heess, Martin Riedmiller", "tldr": "", "abstract": "We present a method for reinforcement learning of closely related skills that are parameterized via a skill embedding space. We learn such skills by taking advantage of latent variables and exploiting a connection between reinforcement learning and variational inference. The main contribution of our work is an entropy-regularized policy gradient formulation for hierarchical policies, and an associated, data-efficient and robust off-policy gradient algorithm based on stochastic value gradients. We demonstrate the effectiveness of our method on several simulated robotic manipulation tasks. We find that our method allows for discovery of multiple solutions and is capable of learning the minimum number of distinct skills that are necessary to solve a given set of tasks. 
In addition, our results indicate that the hereby proposed technique can interpolate and/or sequence previously learned skills in order to accomplish more complex tasks, even in the presence of sparse rewards.\n", "keywords": "Deep Reinforcement Learning;Variational Inference;Control;Robotics", "primary_area": "", "supplementary_material": "", "author": "Karol Hausman;Jost Tobias Springenberg;Ziyu Wang;Nicolas Heess;Martin Riedmiller", "authorids": "hausmankarol@gmail.com;springenberg@google.com;ziyu@google.com;heess@google.com;riedmiller@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhausman2018learning,\ntitle={Learning an Embedding Space for Transferable Robot Skills},\nauthor={Karol Hausman and Jost Tobias Springenberg and Ziyu Wang and Nicolas Heess and Martin Riedmiller},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rk07ZXZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 365, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1816310193271208067&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rk07ZXZRb", "pdf": "https://openreview.net/pdf?id=rk07ZXZRb", "email": ";;;;", "author_num": 5 }, { "id": "rk1FQA0pW", "title": "End-to-End Abnormality Detection in Medical Imaging", "track": "main", "status": "Reject", "tldr": "Detection of lung nodule starting from projection data rather than images.", "abstract": "Deep neural networks (DNN) have shown promising performance in computer vision. In medical imaging, encouraging results have been achieved with deep learning for applications such as segmentation, lesion detection and classification. Nearly all of the deep learning based image analysis methods work on reconstructed images, which are obtained from original acquisitions via solving inverse problems (reconstruction). The reconstruction algorithms are designed for human observers, but not necessarily optimized for DNNs which can often observe features that are incomprehensible for human eyes. Hence, it is desirable to train the DNNs directly from the original data which lie in a different domain with the images. In this paper, we proposed an end-to-end DNN for abnormality detection in medical imaging. To align the acquisition with the annotations made by radiologists in the image domain, a DNN was built as the unrolled version of iterative reconstruction algorithms to map the acquisitions to images, and followed by a 3D convolutional neural network (CNN) to detect the abnormality in the reconstructed images. The two networks were trained jointly in order to optimize the entire DNN for the detection task from the original acquisitions. The DNN was implemented for lung nodule detection in low-dose chest computed tomography (CT), where a numerical simulation was done to generate acquisitions from 1,018 chest CT images with radiologists' annotations. 
The proposed end-to-end DNN demonstrated better sensitivity and accuracy for the task compared to a two-step approach, in which the reconstruction and detection DNNs were trained separately. A significant reduction of false positive rate on suspicious lesions were observed, which is crucial for the known over-diagnosis in low-dose lung CT imaging. The images reconstructed by the proposed end-to-end network also presented enhanced details in the region of interest. ", "keywords": "End-to-End training;deep neural networks;medical imaging;image reconstruction", "primary_area": "", "supplementary_material": "", "author": "Dufan Wu;Kyungsang Kim;Bin Dong;Quanzheng Li", "authorids": "dwu6@mgh.harvard.edu;kkim24@mgh.harvard.edu;dongbin@math.pku.edu.cn;li.quanzheng@mgh.harvard.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2018endtoend,\ntitle={End-to-End Abnormality Detection in Medical Imaging},\nauthor={Dufan Wu and Kyungsang Kim and Bin Dong and Quanzheng Li},\nyear={2018},\nurl={https://openreview.net/forum?id=rk1FQA0pW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rk1FQA0pW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7677798658801129572&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rk1J969Xz", "title": "MULTI-MODAL GEOLOCATION ESTIMATION USING DEEP NEURAL NETWORKS", "track": "main", "status": "Withdraw", "tldr": "A global geolocation inferencing strategy with novel meshing strategy and demonstrating incorporating additional information can be used to improve the overall performance of a geolocation inference model.", "abstract": "Estimating the location where an image was taken based solely on the contents of the image is a challenging task, even for humans, as properly labeling an image in such a fashion relies heavily on contextual information, and is not as simple as identifying a single object in the image. Thus any methods which attempt to do so must somehow account for these complexities, and no single model to date is completely capable of addressing all challenges. This work contributes to the state of research in image geolocation inferencing by introducing a novel global meshing strategy, outlining a variety of training procedures to overcome the considerable data limitations when training these models, and demonstrating how incorporating additional information can be used to improve the overall performance of a geolocation inference model. In this work, it is shown that Delaunay triangles are an effective type of mesh for geolocation in relatively low volume scenarios when compared to results from state of the art models which use quad trees and an order of magnitude more training data. 
In addition, the time of posting, learned user albuming, and other meta data are easily incorporated to improve geolocation by up to 11% for country-level (750 km) locality accuracy to 3% for city-level (25 km) localities.\n", "keywords": "deep neural networks;geolocation;inception;long-short term memory networks;social media applications", "primary_area": "", "supplementary_material": "", "author": "Jesse Johns;Jeremiah Rounds;Michael Henry", "authorids": "jesse.johns@pnnl.gov;jeremiah.rounds@pnnl.gov;michael.j.henry@pnnl.gov", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rk1J969Xz", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10106741925364932376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rk3b2qxCW", "title": "Policy Gradient For Multidimensional Action Spaces: Action Sampling and Entropy Bonus", "track": "main", "status": "Reject", "tldr": "policy parameterizations and unbiased policy entropy estimators for MDP with large multidimensional discrete action space", "abstract": "In recent years deep reinforcement learning has been shown to be adept at solving sequential decision processes with high-dimensional state spaces such as in the Atari games. Many reinforcement learning problems, however, involve high-dimensional discrete action spaces as well as high-dimensional state spaces. In this paper, we develop a novel policy gradient methodology for the case of large multidimensional discrete action spaces. We propose two approaches for creating parameterized policies: LSTM parameterization and a Modified MDP (MMDP) giving rise to Feed-Forward Network (FFN) parameterization. Both of these approaches provide expressive models to which backpropagation can be applied for training. We then consider entropy bonus, which is typically added to the reward function to enhance exploration. In the case of high-dimensional action spaces, calculating the entropy and the gradient of the entropy requires enumerating all the actions in the action space and running forward and backpropagation for each action, which may be computationally infeasible. We develop several novel unbiased estimators for the entropy bonus and its gradient. Finally, we test our algorithms on two environments: a multi-hunter multi-rabbit grid game and a multi-agent multi-arm bandit problem.", "keywords": "deep reinforcement learning;policy gradient;multidimensional action space;entropy bonus;entropy regularization;discrete action space", "primary_area": "", "supplementary_material": "", "author": "Vuong Ho Quan;Yiming Zhang;Kenny Song;Xiao-Yue Gong;Keith W. 
Ross", "authorids": "quan.hovuong@gmail.com;yiming.zhang@nyu.edu;kenny.song@nyu.edu;xygong@mit.edu;keithwross@nyu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nho2018policy,\ntitle={Policy Gradient For Multidimensional Action Spaces: Action Sampling and Entropy Bonus},\nauthor={Vuong Ho Quan and Yiming Zhang and Kenny Song and Xiao-Yue Gong and Keith W. Ross},\nyear={2018},\nurl={https://openreview.net/forum?id=rk3b2qxCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rk3b2qxCW", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ssqet5i-Q6YJ:scholar.google.com/&scioq=Policy+Gradient+For+Multidimensional+Action+Spaces:+Action+Sampling+and+Entropy+Bonus&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rk3mjYRp-", "title": "Diffusing Policies : Towards Wasserstein Policy Gradient Flows", "track": "main", "status": "Reject", "tldr": "Linking Wasserstein-trust region entropic policy gradients, and the heat equation.", "abstract": "Policy gradients methods often achieve better performance when the change in policy is limited to a small Kullback-Leibler divergence. We derive policy gradients where the change in policy is limited to a small Wasserstein distance (or trust region). This is done in the discrete and continuous multi-armed bandit settings with entropy regularisation. We show that in the small steps limit with respect to the Wasserstein distance $W_2$, policy dynamics are governed by the heat equation, following the Jordan-Kinderlehrer-Otto result. This means that policies undergo diffusion and advection, concentrating near actions with high reward. This helps elucidate the nature of convergence in the probability matching setup, and provides justification for empirical practices such as Gaussian policy priors and additive gradient noise.", "keywords": "Optimal transport;policy gradients;entropy regularization;reinforcement learning;heat equation;Wasserstein;JKO;gradient flows", "primary_area": "", "supplementary_material": "", "author": "Pierre H. Richemond;Brendan Maginnis", "authorids": "phr17@imperial.ac.uk;b.maginnis@imperial.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nh.2018diffusing,\ntitle={Diffusing Policies : Towards Wasserstein Policy Gradient Flows},\nauthor={Pierre H. 
Richemond and Brendan Maginnis},\nyear={2018},\nurl={https://openreview.net/forum?id=rk3mjYRp-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rk3mjYRp-", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BhMLfUKrpEUJ:scholar.google.com/&scioq=Diffusing+Policies+:+Towards+Wasserstein+Policy+Gradient+Flows&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rk3pnae0b", "title": "Topic-Based Question Generation", "track": "main", "status": "Workshop", "tldr": "We propose a neural network that is able to generate topic-specific questions.", "abstract": "Asking questions is an important ability for a chatbot. This paper focuses on question generation. Although there are existing works on question generation based on a piece of descriptive text, it remains to be a very challenging problem. In the paper, we propose a new question generation problem, which also requires the input of a target topic in addition to a piece of descriptive text. The key reason for proposing the new problem is that in practical applications, we found that useful questions need to be targeted toward some relevant topics. One almost never asks a random question in a conversation. Due to the fact that given a descriptive text, it is often possible to ask many types of questions, generating a question without knowing what it is about is of limited use. To solve the problem, we propose a novel neural network that is able to generate topic-specific questions. One major advantage of this model is that it can be trained directly using a question-answering corpus without requiring any additional annotations like annotating topics in the questions or answers. 
Experimental results show that our model outperforms the state-of-the-art baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenpeng Hu;Bing Liu;Rui Yan;Dongyan Zhao;Jinwen Ma", "authorids": "wenpeng.hu@pku.edu.cn;liub@cs.uic.edu;ruiyan@pku.edu.cn;zhaody@pku.edu.cn;jwma@math.pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhu2018topicbased,\ntitle={Topic-Based Question Generation},\nauthor={Wenpeng Hu and Bing Liu and Rui Yan and Dongyan Zhao and Jinwen Ma},\nyear={2018},\nurl={https://openreview.net/forum?id=rk3pnae0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rk3pnae0b", "pdf_size": 0, "rating": "3;4;8", "confidence": "5;4;3", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.944911182523068, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3396280879557052203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Stochastic Variational Video Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/162", "id": "rk49Mg-CW", "author_site": "Mohammad Babaeizadeh, Chelsea Finn, Dumitru Erhan, Roy H Campbell, Sergey Levine", "tldr": "Stochastic variational video prediction in real-world settings.", "abstract": "Predicting the future in real-world settings, particularly from raw sensory observations such as images, is exceptionally challenging. Real-world events can be stochastic and unpredictable, and the high dimensionality and complexity of natural images requires the predictive model to build an intricate understanding of the natural world. Many existing methods tackle this problem by making simplifying assumptions about the environment. One common assumption is that the outcome is deterministic and there is only one plausible future. This can lead to low-quality predictions in real-world settings with stochastic dynamics. In this paper, we develop a stochastic variational video prediction (SV2P) method that predicts a different possible future for each sample of its latent variables. To the best of our knowledge, our model is the first to provide effective stochastic multi-frame prediction for real-world video. We demonstrate the capability of the proposed method in predicting detailed future frames of videos on multiple real-world datasets, both action-free and action-conditioned. We find that our proposed method produces substantially improved video predictions when compared to the same model without stochasticity, and to other stochastic video prediction methods. Our SV2P implementation will be open sourced upon publication.", "keywords": "video prediction;stochastic prediction;variational inference;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Babaeizadeh;Chelsea Finn;Dumitru Erhan;Roy H. 
Campbell;Sergey Levine", "authorids": "mb2@uiuc.edu;cbfinn@eecs.berkeley.edu;dumitru@google.com;rhc@illinois.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbabaeizadeh2018stochastic,\ntitle={Stochastic Variational Video Prediction},\nauthor={Mohammad Babaeizadeh and Chelsea Finn and Dumitru Erhan and Roy H. Campbell and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rk49Mg-CW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rk49Mg-CW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 671, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16282826800103721009&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rk49Mg-CW", "pdf": "https://openreview.net/pdf?id=rk49Mg-CW", "email": ";;;;", "author_num": 5 }, { "id": "rk4Fz2e0b", "title": "Graph Partition Neural Networks for Semi-Supervised Classification", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We present graph partition neural networks (GPNN), an extension of graph neural networks (GNNs) able to handle extremely large graphs. GPNNs alternate between locally propagating information between nodes in small subgraphs and globally propagating information between the subgraphs. To efficiently partition graphs, we experiment with spectral partitioning and also propose a modified multi-seed flood fill for fast processing of large scale graphs. We extensively test our model on a variety of semi-supervised node classification tasks. Experimental results indicate that GPNNs are either superior or comparable to state-of-the-art methods on a wide variety of datasets for graph-based semi-supervised classification. We also show that GPNNs can achieve similar performance as standard GNNs with fewer propagation steps.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Renjie Liao;Marc Brockschmidt;Daniel Tarlow;Alexander Gaunt;Raquel Urtasun;Richard S. Zemel", "authorids": "rjliao@cs.toronto.edu;mabrocks@microsoft.com;dtarlow@google.com;algaunt@microsoft.com;urtasun@cs.toronto.edu;zemel@cs.toronto.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nliao2018graph,\ntitle={Graph Partition Neural Networks for Semi-Supervised Classification},\nauthor={Renjie Liao and Marc Brockschmidt and Daniel Tarlow and Alexander Gaunt and Raquel Urtasun and Richard S. 
Zemel},\nyear={2018},\nurl={https://openreview.net/forum?id=rk4Fz2e0b},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/graph-partition-neural-network-samples](https://github.com/Microsoft/graph-partition-neural-network-samples)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rk4Fz2e0b", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5737279776683826768&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rk6H0ZbRb", "title": "Intriguing Properties of Adversarial Examples", "track": "main", "status": "Workshop", "tldr": "Adversarial error has similar power-law form for all datasets and models studied, and architecture matters.", "abstract": "It is becoming increasingly clear that many machine learning classifiers are vulnerable to adversarial examples. In attempting to explain the origin of adversarial examples, previous studies have typically focused on the fact that neural networks operate on high dimensional data, they overfit, or they are too linear. Here we show that distributions of logit differences have a universal functional form. This functional form is independent of architecture, dataset, and training protocol; nor does it change during training. This leads to adversarial error having a universal scaling, as a power-law, with respect to the size of the adversarial perturbation. We show that this universality holds for a broad range of datasets (MNIST, CIFAR10, ImageNet, and random data), models (including state-of-the-art deep networks, linear models, adversarially trained networks, and networks trained on randomly shuffled labels), and attacks (FGSM, step l.l., PGD). Motivated by these results, we study the effects of reducing prediction entropy on adversarial robustness. Finally, we study the effect of network architectures on adversarial sensitivity. To do this, we use neural architecture search with reinforcement learning to find adversarially robust architectures on CIFAR10. Our resulting architecture is more robust to white \\emph{and} black box attacks compared to previous attempts.\n", "keywords": "adversarial examples;universality;neural architecture search", "primary_area": "", "supplementary_material": "", "author": "Ekin Dogus Cubuk;Barret Zoph;Samuel Stern Schoenholz;Quoc V. Le", "authorids": "cubuk@google.com;barretzoph@google.com;schsam@google.com;qvl@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndogus2018intriguing,\ntitle={Intriguing Properties of Adversarial Examples},\nauthor={Ekin Dogus Cubuk and Barret Zoph and Samuel Stern Schoenholz and Quoc V. 
Le},\nyear={2018},\nurl={https://openreview.net/forum?id=rk6H0ZbRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rk6H0ZbRb", "pdf_size": 0, "rating": "3;5;8", "confidence": "4;2;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.3973597071195132, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18221611654607406641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Learning Intrinsic Sparse Structures within Long Short-Term Memory", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/337", "id": "rk6cfpRjZ", "author_site": "Wei Wen, Yuxiong He, Samyam Rajbhandari, Minjia Zhang, Wenhan Wang, Fang Liu, Bin Hu, Yiran Chen, Hai Li", "tldr": "", "abstract": "Model compression is significant for the wide adoption of Recurrent Neural Networks (RNNs) in both user devices possessing limited resources and business clusters requiring quick responses to large-scale service requests. This work aims to learn structurally-sparse Long Short-Term Memory (LSTM) by reducing the sizes of basic structures within LSTM units, including input updates, gates, hidden states, cell states and outputs. Independently reducing the sizes of basic structures can result in inconsistent dimensions among them, and consequently, end up with invalid LSTM units. To overcome the problem, we propose Intrinsic Sparse Structures (ISS) in LSTMs. Removing a component of ISS will simultaneously decrease the sizes of all basic structures by one and thereby always maintain the dimension consistency. By learning ISS within LSTM units, the obtained LSTMs remain regular while having much smaller basic structures. Based on group Lasso regularization, our method achieves 10.59x speedup without losing any perplexity of a language modeling of Penn TreeBank dataset. It is also successfully evaluated through a compact model with only 2.69M weights for machine Question Answering of SQuAD dataset. Our approach is successfully extended to non- LSTM RNNs, like Recurrent Highway Networks (RHNs). 
Our source code is available.", "keywords": "Sparsity;Model Compression;Acceleration;LSTMs;Recurrent Neural Networks;Structural Learning", "primary_area": "", "supplementary_material": "", "author": "Wei Wen;Yuxiong He;Samyam Rajbhandari;Minjia Zhang;Wenhan Wang;Fang Liu;Bin Hu;Yiran Chen;Hai Li", "authorids": "wei.wen@duke.edu;yuxhe@microsoft.com;samyamr@microsoft.com;minjiaz@microsoft.com;wenhanw@microsoft.com;fangliu@microsoft.com;binhu@microsoft.com;yiran.chen@duke.edu;hai.li@duke.edu", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nwen2018learning,\ntitle={Learning Intrinsic Sparse Structures within Long Short-Term Memory},\nauthor={Wei Wen and Yuxiong He and Samyam Rajbhandari and Minjia Zhang and Wenhan Wang and Fang Liu and Bin Hu and Yiran Chen and Hai Li},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rk6cfpRjZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9492556084863806404&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rk6cfpRjZ", "pdf": "https://openreview.net/pdf?id=rk6cfpRjZ", "email": ";;;;;;;;", "author_num": 9 }, { "id": "rk6qdGgCZ", "title": "Fixing Weight Decay Regularization in Adam", "track": "main", "status": "Reject", "tldr": "Fixing weight decay regularization in adaptive gradient methods such as Adam", "abstract": "We note that common implementations of adaptive gradient algorithms, such as Adam, limit the potential benefit of weight decay regularization, because the weights do not decay multiplicatively (as would be expected for standard weight decay) but by an additive constant factor. \nWe propose a simple way to resolve this issue by decoupling weight decay and the optimization steps taken w.r.t. the loss function. We provide empirical evidence that our proposed modification (i) \ndecouples the optimal choice of weight decay factor from the setting of the learning rate for both standard SGD and Adam, and (ii) substantially improves Adam's generalization performance, allowing it to compete with SGD with momentum on image classification datasets (on which it was previously typically outperformed by the latter).\nWe also demonstrate that longer optimization runs require smaller weight decay values for optimal results and introduce a normalized variant of weight decay to reduce this dependence. Finally, we propose a version of Adam with warm restarts (AdamWR) that has strong anytime performance while achieving state-of-the-art results on CIFAR-10 and ImageNet32x32. 
\nOur source code will become available after the review process.", "keywords": "Adam;Adaptive Gradient Methods;weight decay;L2 regularization", "primary_area": "", "supplementary_material": "", "author": "Ilya Loshchilov;Frank Hutter", "authorids": "ilya.loshchilov@gmail.com;fh@cs.uni-freiburg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nloshchilov2018fixing,\ntitle={Fixing Weight Decay Regularization in Adam},\nauthor={Ilya Loshchilov and Frank Hutter},\nyear={2018},\nurl={https://openreview.net/forum?id=rk6qdGgCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rk6qdGgCZ", "pdf_size": 0, "rating": "4;7;8", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 23, "authors#_avg": 2, "corr_rating_confidence": -0.6933752452815364, "gs_citation": 2282, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6789134442258801173&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rk8R_JWRW", "title": "Gating out sensory noise in a spike-based Long Short-Term Memory network", "track": "main", "status": "Reject", "tldr": " We demonstrate a gated recurrent asynchronous spiking neural network that corresponds to an LSTM unit.", "abstract": "Spiking neural networks are being investigated both as biologically plausible models of neural computation and also as a potentially more efficient type of neural network. While convolutional spiking neural networks have been demonstrated to achieve near state-of-the-art performance, only one solution has been proposed to convert gated recurrent neural networks, so far.\nRecurrent neural networks in the form of networks of gating memory cells have been central in state-of-the-art solutions in problem domains that involve sequence recognition or generation. Here, we design an analog gated LSTM cell where its neurons can be substituted for efficient stochastic spiking neurons. These adaptive spiking neurons implement an adaptive form of sigma-delta coding to convert internally computed analog activation values to spike-trains. For such neurons, we approximate the effective activation function, which resembles a sigmoid. We show how analog neurons with such activation functions can be used to create an analog LSTM cell; networks of these cells can then be trained with standard backpropagation. We train these LSTM networks on a noisy and noiseless version of the original sequence prediction task from Hochreiter & Schmidhuber (1997), and also on a noisy and noiseless version of a classical working memory reinforcement learning task, the T-Maze. 
Substituting the analog neurons for corresponding adaptive spiking neurons, we then show that almost all resulting spiking neural network equivalents correctly compute the original tasks.", "keywords": "spiking neural networks;LSTM;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "Davide Zambrano;Isabella Pozzi;Roeland Nusselder;Sander Bohte", "authorids": "d.zambrano@cwi.nl;isabella.pozzi@cwi.nl;roeland.nusselder@gmail.com;s.m.bohte@cwi.nl", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzambrano2018gating,\ntitle={Gating out sensory noise in a spike-based Long Short-Term Memory network},\nauthor={Davide Zambrano and Isabella Pozzi and Roeland Nusselder and Sander Bohte},\nyear={2018},\nurl={https://openreview.net/forum?id=rk8R_JWRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rk8R_JWRW", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IfUdbNE4zRQJ:scholar.google.com/&scioq=Gating+out+sensory+noise+in+a+spike-based+Long+Short-Term+Memory+network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rk8wKk-R-", "title": "Convolutional Sequence Modeling Revisited", "track": "main", "status": "Workshop", "tldr": "We argue that convolutional networks should be considered the default starting point for sequence modeling tasks.", "abstract": "This paper revisits the problem of sequence modeling using convolutional \narchitectures. Although both convolutional and recurrent architectures have a\nlong history in sequence prediction, the current \"default\" mindset in much of\nthe deep learning community is that generic sequence modeling is best handled\nusing recurrent networks. The goal of this paper is to question this assumption. \nSpecifically, we consider a simple generic temporal convolution network (TCN),\nwhich adopts features from modern ConvNet architectures such as a dilations and \nresidual connections. We show that on a variety of sequence modeling tasks,\nincluding many frequently used as benchmarks for evaluating recurrent networks,\nthe TCN outperforms baseline RNN methods (LSTMs, GRUs, and vanilla RNNs) and\nsometimes even highly specialized approaches. We further show that the\npotential \"infinite memory\" advantage that RNNs have over TCNs is largely\nabsent in practice: TCNs indeed exhibit longer effective history sizes than their \nrecurrent counterparts. As a whole, we argue that it may be time to (re)consider \nConvNets as the default \"go to\" architecture for sequence modeling.", "keywords": "Temporal Convolutional Network;Sequence Modeling;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Shaojie Bai;J. Zico Kolter;Vladlen Koltun", "authorids": "shaojieb@cs.cmu.edu;zkolter@cs.cmu.edu;vkoltun@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbai2018convolutional,\ntitle={Convolutional Sequence Modeling Revisited},\nauthor={Shaojie Bai and J. 
Zico Kolter and Vladlen Koltun},\nyear={2018},\nurl={https://openreview.net/forum?id=rk8wKk-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rk8wKk-R-", "pdf_size": 0, "rating": "4;5;8", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.6933752452815364, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1953324364890601770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rk9kKMZ0-", "title": "LEAP: Learning Embeddings for Adaptive Pace", "track": "main", "status": "Reject", "tldr": "LEAP combines the strength of adaptive sampling with that of mini-batch online learning and adaptive representation learning to formulate a representative self-paced strategy in an end-to-end DNN training protocol. ", "abstract": "Determining the optimal order in which data examples are presented to Deep Neural Networks during training is a non-trivial problem. However, choosing a non-trivial scheduling method may drastically improve convergence. In this paper, we propose a Self-Paced Learning (SPL)-fused Deep Metric Learning (DML) framework, which we call Learning Embeddings for Adaptive Pace (LEAP). Our method parameterizes mini-batches dynamically based on the \\textit{easiness} and \\textit{true diverseness} of the sample within a salient feature representation space. In LEAP, we train an \\textit{embedding} Convolutional Neural Network (CNN) to learn an expressive representation space by adaptive density discrimination using the Magnet Loss. The \\textit{student} CNN classifier dynamically selects samples to form a mini-batch based on the \\textit{easiness} from cross-entropy losses and \\textit{true diverseness} of examples from the representation space sculpted by the \\textit{embedding} CNN. We evaluate LEAP using deep CNN architectures for the task of supervised image classification on MNIST, FashionMNIST, CIFAR-10, CIFAR-100, and SVHN. We show that the LEAP framework converges faster with respect to the number of mini-batch updates required to achieve a comparable or better test performance on each of the datasets.", "keywords": "deep metric learning;self-paced learning;representation learning;cnn", "primary_area": "", "supplementary_material": "", "author": "Vithursan Thangarasa;Graham W. Taylor", "authorids": "vthangar@uoguelph.ca;gwtaylor@uoguelph.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nthangarasa2018leap,\ntitle={{LEAP}: Learning Embeddings for Adaptive Pace},\nauthor={Vithursan Thangarasa and Graham W. 
Taylor},\nyear={2018},\nurl={https://openreview.net/forum?id=rk9kKMZ0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rk9kKMZ0-", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iGGOzVeGV6oJ:scholar.google.com/&scioq=LEAP:+Learning+Embeddings+for+Adaptive+Pace&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkA1f3NpZ", "title": "Ensemble Methods as a Defense to Adversarial Perturbations Against Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "Using ensemble methods as a defense to adversarial perturbations against deep neural networks.", "abstract": "Deep learning has become the state of the art approach in many machine learning problems such as classification. It has recently been shown that deep learning is highly vulnerable to adversarial perturbations. Taking the camera systems of self-driving cars as an example, small adversarial perturbations can cause the system to make errors in important tasks, such as classifying traffic signs or detecting pedestrians. Hence, in order to use deep learning without safety concerns a proper defense strategy is required. We propose to use ensemble methods as a defense strategy against adversarial perturbations. We find that an attack leading one model to misclassify does not imply the same for other networks performing the same task. This makes ensemble methods an attractive defense strategy against adversarial attacks. We empirically show for the MNIST and the CIFAR-10 data sets that ensemble methods not only improve the accuracy of neural networks on test data but also increase their robustness against adversarial perturbations.", "keywords": "Ensemble Method;Adversarial Perturbations;Deep Neural Networks;Defense;Attack", "primary_area": "", "supplementary_material": "", "author": "Thilo Strauss;Markus Hanselmann;Andrej Junginger;Holger Ulmer", "authorids": "thilo.strauss@etas.com;markus.hanselmann@etas.com;andrej.junginger@etas.com;holger.ulmer@etas.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nstrauss2018ensemble,\ntitle={Ensemble Methods as a Defense to Adversarial Perturbations Against Deep Neural Networks},\nauthor={Thilo Strauss and Markus Hanselmann and Andrej Junginger and Holger Ulmer},\nyear={2018},\nurl={https://openreview.net/forum?id=rkA1f3NpZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkA1f3NpZ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13617355350210627571&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkEfPeZRb", "title": "Variance-based Gradient Compression for Efficient Distributed Deep Learning", "track": "main", "status": "Workshop", "tldr": "A new algorithm to reduce the communication overhead of distributed deep learning by distinguishing 
\u2018unambiguous\u2019 gradients.", "abstract": "Due to the substantial computational cost, training state-of-the-art deep neural networks for large-scale datasets often requires distributed training using multiple computation workers. However, by nature, workers need to frequently communicate gradients, causing severe bottlenecks, especially on lower bandwidth connections. A few methods have been proposed to compress gradient for efficient communication, but they either suffer a low compression ratio or significantly harm the resulting model accuracy, particularly when applied to convolutional neural networks. To address these issues, we propose a method to reduce the communication overhead of distributed deep learning. Our key observation is that gradient updates can be delayed until an unambiguous (high amplitude, low variance) gradient has been calculated. We also present an efficient algorithm to compute the variance and prove that it can be obtained with negligible additional cost. We experimentally show that our method can achieve very high compression ratio while maintaining the result model accuracy. We also analyze the efficiency using computation and communication cost models and provide the evidence that this method enables distributed deep learning for many scenarios with commodity environments.", "keywords": "distributed deep learning;gradient compression;collective communication;data parallel distributed sgd;image classification", "primary_area": "", "supplementary_material": "", "author": "Yusuke Tsuzuku;Hiroto Imachi;Takuya Akiba", "authorids": "tsuzuku@ms.k.u-tokyo.ac.jp;imachi@preferred.jp;akiba@preferred.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntsuzuku2018variancebased,\ntitle={Variance-based Gradient Compression for Efficient Distributed Deep Learning},\nauthor={Yusuke Tsuzuku and Hiroto Imachi and Takuya Akiba},\nyear={2018},\nurl={https://openreview.net/forum?id=rkEfPeZRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkEfPeZRb", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12956190793258262639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rkEtzzWAb", "title": "Parametric Adversarial Divergences are Good Task Losses for Generative Modeling", "track": "main", "status": "Workshop", "tldr": "Parametric adversarial divergences implicitly define more meaningful task losses for generative modeling, we make parallels with structured prediction to study the properties of these divergences and their ability to encode the task of interest.", "abstract": "Generative modeling of high dimensional data like images is a notoriously difficult and ill-defined problem. In particular, how to evaluate a learned generative model is unclear.\nIn this paper, we argue that *adversarial learning*, pioneered with generative adversarial networks (GANs), provides an interesting framework to implicitly define more meaningful task losses for unsupervised tasks, such as for generating \"visually realistic\" images. 
By relating GANs and structured prediction under the framework of statistical decision theory, we put into light links between recent advances in structured prediction theory and the choice of the divergence in GANs. We argue that the insights about the notions of \"hard\" and \"easy\" to learn losses can be analogously extended to adversarial divergences. We also discuss the attractive properties of parametric adversarial divergences for generative modeling, and perform experiments to show the importance of choosing a divergence that reflects the final task.", "keywords": "parametric;adversarial;divergence;generative;modeling;gan;neural;network;task;loss;structured;prediction", "primary_area": "", "supplementary_material": "", "author": "Gabriel Huang;Hugo Berard;Ahmed Touati;Gauthier Gidel;Pascal Vincent;Simon Lacoste-Julien", "authorids": "gbxhuang@gmail.com;berard.hugo@gmail.com;ahmed.touati@umontreal.ca;gauthier.gidel@inria.fr;pascal.vincent@umontreal.ca;slacoste@iro.umontreal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nhuang2018parametric,\ntitle={Parametric Adversarial Divergences are Good Task Losses for Generative Modeling},\nauthor={Gabriel Huang and Hugo Berard and Ahmed Touati and Gauthier Gidel and Pascal Vincent and Simon Lacoste-Julien},\nyear={2018},\nurl={https://openreview.net/forum?id=rkEtzzWAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkEtzzWAb", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1846743194234830491&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkGZuJb0b", "title": "Compact Neural Networks based on the Multiscale Entanglement Renormalization Ansatz", "track": "main", "status": "Reject", "tldr": "We replace the fully connected layers of a neural network with the multi-scale entanglement renormalization ansatz, a type of quantum operation which describes long range correlations. ", "abstract": "The goal of this paper is to demonstrate a method for tensorizing neural networks based upon an efficient way of approximating scale invariant quantum states, the Multi-scale Entanglement Renormalization Ansatz (MERA). We employ MERA as a replacement for linear layers in a neural network and test this implementation on the CIFAR-10 dataset. The proposed method outperforms factorization using tensor trains, providing greater compression for the same level of accuracy and greater accuracy for the same level of compression. We demonstrate MERA-layers with 3900 times fewer parameters and a reduction in accuracy of less than 1% compared to the equivalent fully connected layers.\n", "keywords": "Neural Networks;Tensor Networks;Tensor Trains", "primary_area": "", "supplementary_material": "", "author": "Andrew Hallam;Edward Grant;Vid Stojevic;Simone Severini;Andrew G. 
Green", "authorids": "andrew.hallam.10@ucl.ac.uk;edward.grant.16@ucl.ac.uk;vstojevic@gtn.ai;s.severini@ucl.ac.uk;andrew.green@ucl.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhallam2018compact,\ntitle={Compact Neural Networks based on the Multiscale Entanglement Renormalization Ansatz},\nauthor={Andrew Hallam and Edward Grant and Vid Stojevic and Simone Severini and Andrew G. Green},\nyear={2018},\nurl={https://openreview.net/forum?id=rkGZuJb0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkGZuJb0b", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12591425231848530247&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8 }, { "title": "The Reactor: A fast and sample-efficient Actor-Critic agent for Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/133", "id": "rkHVZWZAZ", "author_site": "Audrunas Gruslys, Will Dabney, Mohammad Gheshlaghi Azar, Bilal Piot, Marc G Bellemare, Remi Munos", "tldr": "Reactor combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN while giving better run-time performance than A3C.", "abstract": "In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). Our first contribution is a new policy evaluation algorithm called Distributional Retrace, which brings multi-step off-policy updates to the distributional reinforcement learning setting. The same approach can be used to convert several classes of multi-step policy evaluation algorithms designed for expected value evaluation into distributional ones. Next, we introduce the \u03b2-leaveone-out policy gradient algorithm which improves the trade-off between variance and bias by using action values as a baseline. Our final algorithmic contribution is a new prioritized replay algorithm for sequences, which exploits the temporal locality of neighboring observations for more efficient replay prioritization. Using the Atari 2600 benchmarks, we show that each of these innovations contribute to both the sample efficiency and final agent performance. 
Finally, we demonstrate that Reactor reaches state-of-the-art performance after 200 million frames and less than a day of training.", "keywords": "reinforcement learning;policy gradient;distributional reinforcement learning;distributed computing", "primary_area": "", "supplementary_material": "", "author": "Audrunas Gruslys;Will Dabney;Mohammad Gheshlaghi Azar;Bilal Piot;Marc Bellemare;Remi Munos", "authorids": "audrunas@google.com;wdabney@google.com;mazar@google.com;piot@google.com;bellemare@google.com;munos@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ngruslys2018the,\ntitle={The Reactor: A fast and sample-efficient Actor-Critic agent for Reinforcement Learning},\nauthor={Audrunas Gruslys and Will Dabney and Mohammad Gheshlaghi Azar and Bilal Piot and Marc Bellemare and Remi Munos},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkHVZWZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;2;4", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8283235639510258055&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkHVZWZAZ", "pdf": "https://openreview.net/pdf?id=rkHVZWZAZ", "email": ";;;;;", "author_num": 6 }, { "title": "Learning Robust Rewards with Adverserial Inverse Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/148", "id": "rkHywl-A-", "author_site": "Justin Fu, Katie Luo, Sergey Levine", "tldr": "We propose an adversarial inverse reinforcement learning algorithm capable of learning reward functions which can transfer to new, unseen environments.", "abstract": "Reinforcement learning provides a powerful and general framework for decision\nmaking and control, but its application in practice is often hindered by the need\nfor extensive feature and reward engineering. Deep reinforcement learning methods\ncan remove the need for explicit engineering of policy or value features, but\nstill require a manually specified reward function. Inverse reinforcement learning\nholds the promise of automatic reward acquisition, but has proven exceptionally\ndifficult to apply to large, high-dimensional problems with unknown dynamics. In\nthis work, we propose AIRL, a practical and scalable inverse reinforcement learning\nalgorithm based on an adversarial reward learning formulation that is competitive\nwith direct imitation learning algorithms. 
Additionally, we show that AIRL is\nable to recover portable reward functions that are robust to changes in dynamics,\nenabling us to learn policies even under significant variation in the environment\nseen during training.", "keywords": "inverse reinforcement learning;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Justin Fu;Katie Luo;Sergey Levine", "authorids": "justinjfu@eecs.berkeley.edu;katieluo@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfu2018learning,\ntitle={Learning Robust Rewards with Adverserial Inverse Reinforcement Learning},\nauthor={Justin Fu and Katie Luo and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkHywl-A-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;2;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15373093331363538470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkHywl-A-", "pdf": "https://openreview.net/pdf?id=rkHywl-A-", "email": ";;", "author_num": 3 }, { "title": "Neumann Optimizer: A Practical Optimization Algorithm for Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/174", "id": "rkLyJl-0-", "author_site": "Shankar Krishnan, Ying Xiao, Rif A. Saurous", "tldr": "We describe a practical optimization algorithm for deep neural networks that works faster and generates better models compared to widely used algorithms.", "abstract": "Progress in deep learning is slowed by the days or weeks it takes to train large models. The natural solution of using more hardware is limited by diminishing returns, and leads to inefficient use of additional resources. In this paper, we present a large batch, stochastic optimization algorithm that is both faster than widely used algorithms for fixed amounts of computation, and also scales up substantially better as more computational resources become available. Our algorithm implicitly computes the inverse Hessian of each mini-batch to produce descent directions; we do so without either an explicit approximation to the Hessian or Hessian-vector products. We demonstrate the effectiveness of our algorithm by successfully training large ImageNet models (InceptionV3, ResnetV1-50, ResnetV1-101 and InceptionResnetV2) with mini-batch sizes of up to 32000 with no loss in validation error relative to current baselines, and no increase in the total number of steps. At smaller mini-batch sizes, our optimizer improves the validation error in these models by 0.8-0.9\\%. Alternatively, we can trade off this accuracy to reduce the number of training steps needed by roughly 10-30\\%. Our work is practical and easily usable by others -- only one hyperparameter (learning rate) needs tuning, and furthermore, the algorithm is as computationally cheap as the commonly used Adam optimizer.", "keywords": "Deep Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Shankar Krishnan;Ying Xiao;Rif. A. 
Saurous", "authorids": "skrishnan@google.com;yingxiao@google.com;rif@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkrishnan2018neumann,\ntitle={Neumann Optimizer: A Practical Optimization Algorithm for Deep Neural Networks},\nauthor={Shankar Krishnan and Ying Xiao and Rif. A. Saurous},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkLyJl-0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14026126670370894242&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rkLyJl-0-", "pdf": "https://openreview.net/pdf?id=rkLyJl-0-", "email": ";;", "author_num": 3 }, { "id": "rkMt1bWAZ", "title": "Bias-Variance Decomposition for Boltzmann Machines", "track": "main", "status": "Reject", "tldr": "We achieve bias-variance decomposition for Boltzmann machines using an information geometric formulation.", "abstract": "We achieve bias-variance decomposition for Boltzmann machines using an information geometric formulation. Our decomposition leads to an interesting phenomenon that the variance does not necessarily increase when more parameters are included in Boltzmann machines, while the bias always decreases. Our result gives a theoretical evidence of the generalization ability of deep learning architectures because it provides the possibility of increasing the representation power with avoiding the variance inflation.", "keywords": "Boltzmann machine;bias-variance decomposition;information geometry", "primary_area": "", "supplementary_material": "", "author": "Mahito Sugiyama;Koji Tsuda;Hiroyuki Nakahara", "authorids": "mahito@nii.ac.jp;tsuda@k.u-tokyo.ac.jp;hiro@brain.riken.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsugiyama2018biasvariance,\ntitle={Bias-Variance Decomposition for Boltzmann Machines},\nauthor={Mahito Sugiyama and Koji Tsuda and Hiroyuki Nakahara},\nyear={2018},\nurl={https://openreview.net/forum?id=rkMt1bWAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkMt1bWAZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "2;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a_RpPOyFZT4J:scholar.google.com/&scioq=Bias-Variance+Decomposition+for+Boltzmann+Machines&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "SCAN: Learning Hierarchical Compositional Visual Concepts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/150", "id": "rkN2Il-RZ", "author_site": "Irina Higgins, Nicolas Sonnerat, Loic Matthey, Arka Pal, Christopher Burgess, Matko Bo\u0161njak, Murray Shanahan, Matthew Botvinick, , Alexander Lerchner", "tldr": "We present a neural 
variational model for learning language-guided compositional visual concepts.", "abstract": "The seemingly infinite diversity of the natural world arises from a relatively small set of coherent rules, such as the laws of physics or chemistry. We conjecture that these rules give rise to regularities that can be discovered through primarily unsupervised experiences and represented as abstract concepts. If such representations are compositional and hierarchical, they can be recombined into an exponentially large set of new concepts. This paper describes SCAN (Symbol-Concept Association Network), a new framework for learning such abstractions in the visual domain. SCAN learns concepts through fast symbol association, grounding them in disentangled visual primitives that are discovered in an unsupervised manner. Unlike state of the art multimodal generative model baselines, our approach requires very few pairings between symbols and images and makes no assumptions about the form of symbol representations. Once trained, SCAN is capable of multimodal bi-directional inference, generating a diverse set of image samples from symbolic descriptions and vice versa. It also allows for traversal and manipulation of the implicit hierarchy of visual concepts through symbolic instructions and learnt logical recombination operations. Such manipulations enable SCAN to break away from its training data distribution and imagine novel visual concepts through symbolically instructed recombination of previously learnt concepts.", "keywords": "grounded visual concepts;compositional representation;concept hierarchy;disentangling;beta-VAE;variational autoencoder;deep learning;generative model", "primary_area": "", "supplementary_material": "", "author": "Irina Higgins;Nicolas Sonnerat;Loic Matthey;Arka Pal;Christopher P Burgess;Matko Bo\u0161njak;Murray Shanahan;Matthew Botvinick;Demis Hassabis;Alexander Lerchner", "authorids": "irinah@google.com;sonnerat@google.com;lmatthey@google.com;arkap@google.com;cpburgess@google.com;matko@google.com;mshanahan@google.com;botvinick@google.com;demishassabis@google.com;lerchner@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nhiggins2018scan,\ntitle={{SCAN}: Learning Hierarchical Compositional Visual Concepts},\nauthor={Irina Higgins and Nicolas Sonnerat and Loic Matthey and Arka Pal and Christopher P Burgess and Matko Bo\u0161njak and Murray Shanahan and Matthew Botvinick and Demis Hassabis and Alexander Lerchner},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkN2Il-RZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9825853684146540580&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkN2Il-RZ", "pdf": "https://openreview.net/pdf?id=rkN2Il-RZ", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "rkNzJ4m0-", "title": "Overview on Reinforcement Learning for Robotics", "track": "main", "status": "Withdraw", "tldr": "", "abstract": 
"Reinforcement Learning(RL) offers robotics tasks a framework and set of tools for the design of sophisticated and hard-to-engineer behaviors.[1]\nReinforcement Learning is an branch of Machine Learning inspired by behaviorist psychology[wiki- Reinforcemenr Learning], concerned with how software agents ought to take actions in an environment so as to maximize some notion of cumulative reward.\nThe basic idea of Reinforcement Learning is to obtain a policy that extract more reward from the environment by picking actions given a state.\nBy policy, we mean a decision maker (Agent) that decide on an action based on some parameterized rules given an input observation of environment (State). The policy can be a set of weight that linearly combine the features in a state or different structured Neural Network. The environment in Reinforcement Learning context provide the agent a new state and reward immediately after the agent takes a specific action.\nFrom a more broad view, the Machine Learning method was mainly three folds. The Supervised Learning, Semi-supervised Learning and Unsupervised Learning. The supervised learning network was trained given a dataset including the observation data and the corresponding categorization. The latter was given a dataset that no classification is label to the observation data. For reinforcement Learning, it is more close to supervised learning, while its label is obtained by exploring the environment and get feedback (reward, r) from it. The RL algorithm marks the policy that generates the highest score as the training target and make small change of its parameters (or weights, \u03b8) towards that policy until the policy converge.\nIn this report, we mainly focus on the methods of Reinforcement Learning methods for robotics.", "keywords": "Reinforcement Learning;Robotics", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper23/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{ \nanonymous2018overview, \ntitle={Overview on Reinforcement Learning for Robotics}, \nauthor={Anonymous}, \njournal={International Conference on Learning Representations}, \nyear={2018} \n}", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=rkNzJ4m0-", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 1, "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Memorization Precedes Generation: Learning Unsupervised GANs with Memory Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/292", "id": "rkO3uTkAZ", "author_site": "Youngjin Kim, Minjung Kim, Gunhee Kim", "tldr": "", "abstract": "We propose an approach to address two issues that commonly occur during training of unsupervised GANs. First, since GANs use only a continuous latent distribution to embed multiple classes or clusters of data, they often do not correctly handle the structural discontinuity between disparate classes in a latent space. Second, discriminators of GANs easily forget about past generated samples by generators, incurring instability during adversarial training. 
We argue that these two infamous problems of unsupervised GAN training can be largely alleviated by a learnable memory network which both generators and discriminators can access. Generators can effectively learn representations of training samples to understand the underlying cluster distributions of the data, which eases the structural discontinuity problem. At the same time, discriminators can better memorize clusters of previously generated samples, which mitigates the forgetting problem. We propose a novel end-to-end GAN model named memoryGAN, which involves a memory network that is unsupervisedly trainable and integrable into many existing GAN models. With evaluations on multiple datasets such as Fashion-MNIST, CelebA, CIFAR10, and Chairs, we show that our model is probabilistically interpretable, and generates realistic image samples of high visual fidelity. The memoryGAN also achieves state-of-the-art inception scores over unsupervised GAN models on the CIFAR10 dataset, without any optimization tricks or weaker divergences.", "keywords": "Generative Adversarial Networks;Memory Networks", "primary_area": "", "supplementary_material": "", "author": "Youngjin Kim;Minjung Kim;Gunhee Kim", "authorids": "youngjin.kim@vision.snu.ac.kr;minjung.kim1994@gmail.com;gunhee@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkim2018memorization,\ntitle={Memorization Precedes Generation: Learning Unsupervised {GAN}s with Memory Networks},\nauthor={Youngjin Kim and Minjung Kim and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkO3uTkAZ},\n}", "github": "[![github](/images/github_icon.svg) whyjay/memoryGAN](https://github.com/whyjay/memoryGAN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7548592689214672445&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkO3uTkAZ", "pdf": "https://openreview.net/pdf?id=rkO3uTkAZ", "email": ";;", "author_num": 3 }, { "id": "rkONG0xAW", "title": "Recursive Binary Neural Network Learning Model with 2-bit/weight Storage Requirement", "track": "main", "status": "Reject", "tldr": "We propose a learning model enabling DNN to learn with only 2 bit/weight, which is especially useful for on-device learning", "abstract": "This paper presents a storage-efficient learning model titled Recursive Binary Neural Networks for embedded and mobile devices having a limited amount of on-chip data storage such as hundreds of kilo-Bytes. The main idea of the proposed model is to recursively recycle data storage of weights (parameters) during training. This enables a device with a given storage constraint to train and instantiate a neural network classifier with a larger number of weights on a chip, achieving better classification accuracy. Such efficient use of on-chip storage reduces off-chip storage accesses, improving energy-efficiency and speed of training. We verified the proposed training model with deep and convolutional neural network classifiers on the MNIST and voice activity detection benchmarks. 
For the deep neural network, our model achieves data storage requirement of as low as 2 bits/weight, whereas the conventional binary neural network learning models require data storage of 8 to 32 bits/weight. With the same amount of data storage, our model can train a bigger network having more weights, achieving 1% less test error than the conventional binary neural network learning model. To achieve the similar classification error, the conventional binary neural network model requires 4\u00d7 more data storage for weights than our proposed model. For the convolution neural network classifier, the proposed model achieves 2.4% less test error for the same on-chip storage or 6\u00d7 storage savings to achieve the similar accuracy.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianchan Guan;Xiaoyang Zeng;Mingoo Seok", "authorids": "tg2569@columbia.edu;xyzeng@fudan.edu;ms4415@columbia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nguan2018recursive,\ntitle={Recursive Binary Neural Network Learning Model with 2-bit/weight Storage Requirement},\nauthor={Tianchan Guan and Xiaoyang Zeng and Mingoo Seok},\nyear={2018},\nurl={https://openreview.net/forum?id=rkONG0xAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkONG0xAW", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9845907941895558735&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Modular Continual Learning in a Unified Visual Environment", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/165", "id": "rkPLzgZAZ", "author_site": "Kevin Feigelis, Blue Sheffer, Daniel L Yamins", "tldr": "We propose a neural module approach to continual learning using a unified visual environment with a large action space.", "abstract": " A core aspect of human intelligence is the ability to learn new tasks quickly and switch between them flexibly. Here, we describe a modular continual reinforcement learning paradigm inspired by these abilities. We first introduce a visual interaction environment that allows many types of tasks to be unified in a single framework. We then describe a reward map prediction scheme that learns new tasks robustly in the very large state and action spaces required by such an environment. We investigate how properties of module architecture influence efficiency of task learning, showing that a module motif incorporating specific design principles (e.g. early bottlenecks, low-order polynomial nonlinearities, and symmetry) significantly outperforms more standard neural network motifs, needing fewer training examples and fewer neurons to achieve high levels of performance. Finally, we present a meta-controller architecture for task switching based on a dynamic neural voting scheme, which allows new modules to use information learned from previously-seen tasks to substantially improve their own learning efficiency. 
", "keywords": "Continual Learning;Neural Modules;Interface Learning;Task Switching;Reinforcement Learning;Visual Decision Making", "primary_area": "", "supplementary_material": "", "author": "Kevin T. Feigelis;Blue Sheffer;Daniel L. K. Yamins", "authorids": "feigelis@stanford.edu;bsheffer@stanford.edu;yamins@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nt.2018modular,\ntitle={Modular Continual Learning in a Unified Visual Environment},\nauthor={Kevin T. Feigelis and Blue Sheffer and Daniel L. K. Yamins},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkPLzgZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;8;8", "confidence": "2;2;3", "rating_avg": 7.333333333333333, "confidence_avg": 2.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18112736468437527039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkPLzgZAZ", "pdf": "https://openreview.net/pdf?id=rkPLzgZAZ", "email": ";;", "author_num": 3 }, { "title": "Improving GANs Using Optimal Transport", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/296", "id": "rkQkBnJAb", "author_site": "Tim Salimans, Han Zhang, Alec Radford, Dimitris Metaxas", "tldr": "An extension of GANs combining optimal transport in primal form with an energy distance defined in an adversarially learned feature space.", "abstract": "We present Optimal Transport GAN (OT-GAN), a variant of generative adversarial nets minimizing a new metric measuring the distance between the generator distribution and the data distribution. This metric, which we call mini-batch energy distance, combines optimal transport in primal form with an energy distance defined in an adversarially learned feature space, resulting in a highly discriminative distance function with unbiased mini-batch gradients. 
Experimentally we show OT-GAN to be highly stable when trained with large mini-batches, and we present state-of-the-art results on several popular benchmark problems for image generation.", "keywords": "GAN;generative modeling;adversarial;optimal transport", "primary_area": "", "supplementary_material": "", "author": "Tim Salimans;Han Zhang;Alec Radford;Dimitris Metaxas", "authorids": "tim@openai.com;han.zhang@cs.rutgers.edu;alec@openai.com;dnm@cs.rutgers.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsalimans2018improving,\ntitle={Improving {GAN}s Using Optimal Transport},\nauthor={Tim Salimans and Han Zhang and Alec Radford and Dimitris Metaxas},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkQkBnJAb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkQkBnJAb)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;2;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 388, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3536589889372403455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkQkBnJAb", "pdf": "https://openreview.net/pdf?id=rkQkBnJAb", "email": ";;;", "author_num": 4 }, { "id": "rkQsMCJCb", "title": "Generative Adversarial Networks using Adaptive Convolution", "track": "main", "status": "Reject", "tldr": "We replace normal convolutions with adaptive convolutions to improve GANs generator.", "abstract": "Most existing GANs architectures that generate images use transposed convolution or resize-convolution as their upsampling algorithm from lower to higher resolution feature maps in the generator. We argue that this kind of fixed operation is problematic for GANs to model objects that have very different visual appearances. We propose a novel adaptive convolution method that learns the upsampling algorithm based on the local context at each location to address this problem. We modify a baseline GANs architecture by replacing normal convolutions with adaptive convolutions in the generator. Experiments on CIFAR-10 dataset show that our modified models improve the baseline model by a large margin. Furthermore, our models achieve state-of-the-art performance on CIFAR-10 and STL-10 datasets in the unsupervised setting.", "keywords": "Generative Adversarial Networks;Unsupervised Learning;GANs", "primary_area": "", "supplementary_material": "", "author": "Nhat M. Nguyen;Nilanjan Ray", "authorids": "nmnguyen@ualberta.ca;nray1@ualberta.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nm.2018generative,\ntitle={Generative Adversarial Networks using Adaptive Convolution},\nauthor={Nhat M. 
Nguyen and Nilanjan Ray},\nyear={2018},\nurl={https://openreview.net/forum?id=rkQsMCJCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkQsMCJCb", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14232777861329925083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rkQu4Wb0Z", "title": "DNN Representations as Codewords: Manipulating Statistical Properties via Penalty Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Performance of Deep Neural Network (DNN) heavily depends on the characteristics of hidden layer representations. Unlike the codewords of channel coding, however, the representations of learning cannot be directly designed or controlled. Therefore, we develop a family of penalty regularizers where each one aims to affect one of representation's statistical properties such as sparsity, variance, or covariance. The regularizers are extended to perform class-wise regularization, and the extension is found to provide an outstanding shaping capability. A variety of statistical properties are investigated for 10 different regularization strategies including dropout and batch normalization, and several interesting findings are reported. Using the family of regularizers, performance improvements are confirmed for MNIST, CIFAR-100, and CIFAR-10 classification problems. But more importantly, our results suggest that understanding how to manipulate statistical properties of representations can be an important step toward understanding DNN and that the role and effect of DNN regularizers need to be reconsidered.", "keywords": "DNN representation;penalty regularization;channel coding", "primary_area": "", "supplementary_material": "", "author": "Daeyoung Choi;Changho Shin;Hyunghun Cho;Wonjong Rhee", "authorids": "choid@snu.ac.kr;ch.shin@snu.ac.kr;webofthink@snu.ac.kr;wrhee@snu.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchoi2018dnn,\ntitle={{DNN} Representations as Codewords: Manipulating Statistical Properties via Penalty Regularization},\nauthor={Daeyoung Choi and Changho Shin and Hyunghun Cho and Wonjong Rhee},\nyear={2018},\nurl={https://openreview.net/forum?id=rkQu4Wb0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkQu4Wb0Z", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;3", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7sb2is2s-cIJ:scholar.google.com/&scioq=DNN+Representations+as+Codewords:+Manipulating+Statistical+Properties+via+Penalty+Regularization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Beyond Word Importance: Contextual Decomposition to Extract Interactions from LSTMs", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/164", "id": "rkRwGg-0Z", "author_site": "William Murdoch, Peter J Liu, Bin Yu", "tldr": "We introduce contextual decompositions, an interpretation algorithm for LSTMs 
capable of extracting word, phrase and interaction-level importance scores", "abstract": "The driving force behind the recent success of LSTMs has been their ability to learn complex and non-linear relationships. Consequently, our inability to describe these relationships has led to LSTMs being characterized as black boxes. To this end, we introduce contextual decomposition (CD), an interpretation algorithm for analysing individual predictions made by standard LSTMs, without any changes to the underlying model. By decomposing the output of an LSTM, CD captures the contributions of combinations of words or variables to the final prediction of an LSTM. On the task of sentiment analysis with the Yelp and SST data sets, we show that CD is able to reliably identify words and phrases of contrasting sentiment, and how they are combined to yield the LSTM's final prediction. Using the phrase-level labels in SST, we also demonstrate that CD is able to successfully extract positive and negative negations from an LSTM, something which has not previously been done.", "keywords": "interpretability;LSTM;natural language processing;sentiment analysis;interactions", "primary_area": "", "supplementary_material": "", "author": "W. James Murdoch;Peter J. Liu;Bin Yu", "authorids": "jmurdoch@berkeley.edu;peterjliu@google.com;binyu@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njames2018beyond,\ntitle={Beyond Word Importance: Contextual Decomposition to Extract Interactions from {LSTM}s},\nauthor={W. James Murdoch and Peter J. Liu and Bin Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkRwGg-0Z},\n}", "github": "[![github](/images/github_icon.svg) jamie-murdoch/ContextualDecomposition](https://github.com/jamie-murdoch/ContextualDecomposition) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkRwGg-0Z)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;2", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 272, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9223539489272553209&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkRwGg-0Z", "pdf": "https://openreview.net/pdf?id=rkRwGg-0Z", "email": ";;", "author_num": 3 }, { "id": "rkTBjG-AZ", "title": "DeepArchitect: Automatically Designing and Training Deep Architectures", "track": "main", "status": "Reject", "tldr": "We describe a modular and composable language for describing expressive search spaces over architectures and simple model search algorithms applied to these search spaces. ", "abstract": "In deep learning, performance is strongly affected by the choice of architecture\nand hyperparameters. While there has been extensive work on automatic hyperparameter optimization for simple spaces, complex spaces such as the space of deep\narchitectures remain largely unexplored. As a result, the choice of architecture is\ndone manually by the human expert through a slow trial and error process guided\nmainly by intuition. In this paper we describe a framework for automatically\ndesigning and training deep models. 
We propose an extensible and modular language that allows the human expert to compactly represent complex search spaces\nover architectures and their hyperparameters. The resulting search spaces are tree-structured and therefore easy to traverse. Models can be automatically compiled to\ncomputational graphs once values for all hyperparameters have been chosen. We\ncan leverage the structure of the search space to introduce different model search\nalgorithms, such as random search, Monte Carlo tree search (MCTS), and sequential model-based optimization (SMBO). We present experiments comparing the\ndifferent algorithms on CIFAR-10 and show that MCTS and SMBO outperform\nrandom search. We also present experiments on MNIST, showing that the same\nsearch space achieves near state-of-the-art performance with a few samples. These\nexperiments show that our framework can be used effectively for model discovery, as it is possible to describe expressive search spaces and discover competitive\nmodels without much effort from the human expert. Code for our framework and\nexperiments has been made publicly available.", "keywords": "architecture search;deep learning;hyperparameter tuning", "primary_area": "", "supplementary_material": "", "author": "Renato Negrinho;Geoff Gordon", "authorids": "negrinho@cs.cmu.edu;ggordon@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnegrinho2018deeparchitect,\ntitle={DeepArchitect: Automatically Designing and Training Deep Architectures},\nauthor={Renato Negrinho and Geoff Gordon},\nyear={2018},\nurl={https://openreview.net/forum?id=rkTBjG-AZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkTBjG-AZ", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 206, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12834072685022358922&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Boundary Seeking GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/151", "id": "rkTS8lZAb", "author_site": "R Devon Hjelm, Athul Paul Jacob, Adam Trischler, Tong Che, Kyunghyun Cho, Yoshua Bengio", "tldr": "We address training GANs with discrete data by formulating a policy gradient that generalizes across f-divergences", "abstract": "Generative adversarial networks are a learning framework that rely on training a discriminator to estimate a measure of difference between a target and generated distributions. GANs, as normally formulated, rely on the generated samples being completely differentiable w.r.t. the generative parameters, and thus do not work for discrete data. We introduce a method for training GANs with discrete data that uses the estimated difference measure from the discriminator to compute importance weights for generated samples, thus providing a policy gradient for training the generator. The importance weights have a strong connection to the decision boundary of the discriminator, and we call our method boundary-seeking GANs (BGANs). We demonstrate the effectiveness of the proposed algorithm with discrete image and character-based natural language generation. 
In addition, the boundary-seeking objective extends to continuous data, which can be used to improve stability of training, and we demonstrate this on Celeba, Large-scale Scene Understanding (LSUN) bedrooms, and Imagenet without conditioning.", "keywords": "Generative adversarial networks;generative learning;deep learning;neural networks;adversarial learning;discrete data", "primary_area": "", "supplementary_material": "", "author": "R Devon Hjelm;Athul Paul Jacob;Adam Trischler;Gerry Che;Kyunghyun Cho;Yoshua Bengio", "authorids": "erroneus@gmail.com;apjacob@uwaterloo.ca;adam.trischler@microsoft.com;tong.che@umontreal.ca;kyunghyun.cho@nyu.edu;yoshua.bengio@umontreal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ndevon2018boundary,\ntitle={Boundary Seeking {GAN}s},\nauthor={R Devon Hjelm and Athul Paul Jacob and Adam Trischler and Gerry Che and Kyunghyun Cho and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkTS8lZAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;7", "confidence": "3;3;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 6, "corr_rating_confidence": 0.5, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=rkTS8lZAb", "pdf": "https://openreview.net/pdf?id=rkTS8lZAb", "email": ";;;;;", "author_num": 6 }, { "id": "rkWN3g-AZ", "title": "XGAN: Unsupervised Image-to-Image Translation for many-to-many Mappings", "track": "main", "status": "Reject", "tldr": "XGAN is an unsupervised model for feature-level image-to-image translation applied to semantic style transfer problems such as the face-to-cartoon task, for which we introduce a new dataset.", "abstract": "Style transfer usually refers to the task of applying color and texture information from a specific style image to a given content image while preserving the structure of the latter. Here we tackle the more generic problem of semantic style transfer: given two unpaired collections of images, we aim to learn a mapping between the corpus-level style of each collection, while preserving semantic content shared across the two domains. We introduce XGAN (\"Cross-GAN\"), a dual adversarial autoencoder, which captures a shared representation of the common domain semantic content in an unsupervised way, while jointly learning the domain-to-domain image translations in both directions. We exploit ideas from the domain adaptation literature and define a semantic consistency loss which encourages the model to preserve semantics in the learned embedding space. We report promising qualitative results for the task of face-to-cartoon translation. 
The cartoon dataset we collected for this purpose will also be released as a new benchmark for semantic style transfer.", "keywords": "unsupervised;gan;domain adaptation;style transfer;semantic;image translation;dataset", "primary_area": "", "supplementary_material": "", "author": "Amelie Royer;Konstantinos Bousmalis;Stephan Gouws;Fred Bertsch;Inbar Mosseri;Forrester Cole;Kevin Murphy", "authorids": "aroyer@ist.ac.at;konstantinos@google.com;sgouws@google.com;fredbertsch@google.com;inbarm@google.com;fcole@google.com;kpmurphy@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nroyer2018xgan,\ntitle={{XGAN}: Unsupervised Image-to-Image Translation for many-to-many Mappings},\nauthor={Amelie Royer and Konstantinos Bousmalis and Stephan Gouws and Fred Bertsch and Inbar Mosseri and Forrester Cole and Kevin Murphy},\nyear={2018},\nurl={https://openreview.net/forum?id=rkWN3g-AZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=rkWN3g-AZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkWN3g-AZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4125056012281736352&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "title": "Unsupervised Machine Translation Using Monolingual Corpora Only", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/22", "id": "rkYTTf-AZ", "author_site": "Guillaume Lample, , Ludovic Denoyer, Marc'Aurelio Ranzato", "tldr": "We propose a new unsupervised machine translation model that can learn without using parallel corpora; experimental results show impressive performance on multiple corpora and pairs of languages.", "abstract": "Machine translation has recently achieved impressive performance thanks to recent advances in deep learning and the availability of large-scale parallel corpora. There have been numerous attempts to extend these successes to low-resource language pairs, yet requiring tens of thousands of parallel sentences. In this work, we take this research direction to the extreme and investigate whether it is possible to learn to translate even without any parallel data. We propose a model that takes sentences from monolingual corpora in two different languages and maps them into the same latent space. By learning to reconstruct in both languages from this shared feature space, the model effectively learns to translate without using any labeled data. 
We demonstrate our model on two widely used datasets and two language pairs, reporting BLEU scores of 32.8 and 15.1 on the Multi30k and WMT English-French datasets, without using even a single parallel sentence at training time.", "keywords": "unsupervised;machine translation;adversarial", "primary_area": "", "supplementary_material": "", "author": "Guillaume Lample;Alexis Conneau;Ludovic Denoyer;Marc'Aurelio Ranzato", "authorids": "glample@fb.com;aconneau@fb.com;ludovic.denoyer@lip6.fr;ranzato@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlample2018unsupervised,\ntitle={Unsupervised Machine Translation Using Monolingual Corpora Only},\nauthor={Guillaume Lample and Alexis Conneau and Ludovic Denoyer and Marc'Aurelio Ranzato},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkYTTf-AZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 15 community implementations](https://paperswithcode.com/paper/?openreview=rkYTTf-AZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;5;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1332, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=682955820897938264&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkYTTf-AZ", "pdf": "https://openreview.net/pdf?id=rkYTTf-AZ", "email": ";;;", "author_num": 4 }, { "id": "rkYgAJWCZ", "title": "One-shot and few-shot learning of word embeddings", "track": "main", "status": "Reject", "tldr": "We highlight a technique by which natural language processing systems can learn a new word from context, allowing them to be much more flexible.", "abstract": "Standard deep learning systems require thousands or millions of examples to learn a concept, and cannot integrate new concepts easily. By contrast, humans have an incredible ability to do one-shot or few-shot learning. For instance, from just hearing a word used in a sentence, humans can infer a great deal about it, by leveraging what the syntax and semantics of the surrounding words tells us. Here, we draw inspiration from this to highlight a simple technique by which deep recurrent networks can similarly exploit their prior knowledge to learn a useful representation for a new word from little data. 
This could make natural language processing systems much more flexible, by allowing them to learn continually from the new words they encounter.", "keywords": "One-shot learning;embeddings;word embeddings;natural language processing;NLP", "primary_area": "", "supplementary_material": "", "author": "Andrew Kyle Lampinen;James Lloyd McClelland", "authorids": "lampinen@stanford.edu;mcclelland@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkyle2018oneshot,\ntitle={One-shot and few-shot learning of word embeddings},\nauthor={Andrew Kyle Lampinen and James Lloyd McClelland},\nyear={2018},\nurl={https://openreview.net/forum?id=rkYgAJWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkYgAJWCZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11710759795435317106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Scalable Private Learning with PATE", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/13", "id": "rkZB1XbRZ", "author_site": "Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson", "tldr": "", "abstract": "The rapid adoption of machine learning has increased concerns about the privacy implications of machine learning models trained on sensitive data, such as medical records or other personal information. To address those concerns, one promising approach is Private Aggregation of Teacher Ensembles, or PATE, which transfers to a \"student\" model the knowledge of an ensemble of \"teacher\" models, with intuitive privacy provided by training teachers on disjoint data and strong privacy guaranteed by noisy aggregation of teachers\u2019 answers. However, PATE has so far been evaluated only on simple classification tasks like MNIST, leaving unclear its utility when applied to larger-scale learning tasks and real-world datasets.\n\nIn this work, we show how PATE can scale to learning tasks with large numbers of output classes and uncurated, imbalanced training data with errors. For this, we introduce new noisy aggregation mechanisms for teacher ensembles that are more selective and add less noise, and prove their tighter differential-privacy guarantees. Our new mechanisms build on two insights: the chance of teacher consensus is increased by using more concentrated noise and, lacking consensus, no answer need be given to a student. The consensus answers used are more likely to be correct, offer better intuitive privacy, and incur lower-differential privacy cost. 
Our evaluation shows our mechanisms improve on the original PATE on all measures, and scale to larger tasks with both high utility and very strong privacy (\u03b5 < 1.0).", "keywords": "privacy;differential privacy;machine learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Nicolas Papernot;Shuang Song;Ilya Mironov;Ananth Raghunathan;Kunal Talwar;Ulfar Erlingsson", "authorids": "ngp5056@cse.psu.edu;shs037@eng.ucsd.edu;mironov@google.com;pseudorandom@google.com;kunal@google.com;ulfar@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\npapernot2018scalable,\ntitle={Scalable Private Learning with {PATE}},\nauthor={Nicolas Papernot and Shuang Song and Ilya Mironov and Ananth Raghunathan and Kunal Talwar and Ulfar Erlingsson},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkZB1XbRZ},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/privacy](https://github.com/tensorflow/privacy/tree/master/research/pate_2018) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkZB1XbRZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "1;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 0.1889822365046136, "gs_citation": 809, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15500479304618362745&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rkZB1XbRZ", "pdf": "https://openreview.net/pdf?id=rkZB1XbRZ", "email": ";;;;;", "author_num": 6 }, { "title": "Ensemble Adversarial Training: Attacks and Defenses", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/157", "id": "rkZvSe-RZ", "author_site": "Florian Tramer, Alexey Kurakin, Nicolas Papernot, Ian Goodfellow, Dan Boneh, Patrick McDaniel", "tldr": "Adversarial training with single-step methods overfits, and remains vulnerable to simple black-box and white-box attacks. We show that including adversarial examples from multiple sources helps defend against black-box attacks.", "abstract": "Adversarial examples are perturbed inputs designed to fool machine learning models. Adversarial training injects such examples into training data to increase robustness. To scale this technique to large datasets, perturbations are crafted using fast single-step methods that maximize a linear approximation of the model's loss.\nWe show that this form of adversarial training converges to a degenerate global minimum, wherein small curvature artifacts near the data points obfuscate a linear approximation of the loss. The model thus learns to generate weak perturbations, rather than defend against strong ones. As a result, we find that adversarial training remains vulnerable to black-box attacks, where we transfer perturbations computed on undefended models, as well as to a powerful novel single-step attack that escapes the non-smooth vicinity of the input data via a small random step.\nWe further introduce Ensemble Adversarial Training, a technique that augments training data with perturbations transferred from other models. 
On ImageNet, Ensemble Adversarial Training yields models with strong robustness to black-box attacks. In particular, our most robust model won the first round of the NIPS 2017 competition on Defenses against Adversarial Attacks.", "keywords": "Adversarial Examples;Adversarial Training;Attacks;Defenses;ImageNet", "primary_area": "", "supplementary_material": "", "author": "Florian Tram\u00e8r;Alexey Kurakin;Nicolas Papernot;Ian Goodfellow;Dan Boneh;Patrick McDaniel", "authorids": "tramer@cs.stanford.edu;alexey@kurakin.me;ngp5056@cse.psu.edu;goodfellow@google.com;dabo@cs.stanford.edu;mcdaniel@cse.psu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ntram\u00e8r2018ensemble,\ntitle={Ensemble Adversarial Training: Attacks and Defenses},\nauthor={Florian Tram\u00e8r and Alexey Kurakin and Nicolas Papernot and Ian Goodfellow and Dan Boneh and Patrick McDaniel},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkZvSe-RZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 11 community implementations](https://paperswithcode.com/paper/?openreview=rkZvSe-RZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;2;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 3557, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10511209374384426640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=rkZvSe-RZ", "pdf": "https://openreview.net/pdf?id=rkZvSe-RZ", "email": ";;;;;", "author_num": 6 }, { "id": "rkZzY-lCb", "title": "Feat2Vec: Dense Vector Representation for Data with Arbitrary Features", "track": "main", "status": "Reject", "tldr": "Learn dense vector representations of arbitrary types of features in labeled and unlabeled datasets", "abstract": "Methods that calculate dense vector representations for features in unstructured data\u2014such as words in a document\u2014have proven to be very successful for knowledge representation. We study how to estimate dense representations when multiple feature types exist within a dataset for supervised learning where explicit labels are available, as well as for unsupervised learning where there are no labels. Feat2Vec calculates embeddings for data with multiple feature types enforcing that all different feature types exist in a common space. In the supervised case, we show that our method has advantages over recently proposed methods, such as enabling higher prediction accuracy, and providing a way to avoid the cold-start\nproblem. In the unsupervised case, our experiments suggest that Feat2Vec significantly outperforms existing algorithms that do not leverage the structure of the data. We believe that we are the first to propose a method for learning unsupervised embeddings that leverage the structure of multiple feature types.", "keywords": "unsupervised learning;supervised learning;knowledge representation;deep learning", "primary_area": "", "supplementary_material": "", "author": "Luis Armona;Jos\u00e9 P. 
Gonz\u00e1lez-Brenes;Ralph Edezhath", "authorids": "luisarmona@gmail.com;jgonzalez@chegg.com;ralph.angelus@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\narmona2018featvec,\ntitle={Feat2Vec: Dense Vector Representation for Data with Arbitrary Features},\nauthor={Luis Armona and Jos\u00e9 P. Gonz\u00e1lez-Brenes and Ralph Edezhath},\nyear={2018},\nurl={https://openreview.net/forum?id=rkZzY-lCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkZzY-lCb", "pdf_size": 0, "rating": "2;7;7", "confidence": "2;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11836736341260209744&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2 }, { "id": "rkaT3zWCZ", "title": "Building Generalizable Agents with a Realistic and Rich 3D Environment", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Teaching an agent to navigate in an unseen 3D environment is a challenging task, even in the event of simulated environments. To generalize to unseen environments, an agent needs to be robust to low-level variations (e.g. color, texture, object changes), and also high-level variations (e.g. layout changes of the environment). To improve overall generalization, all types of variations in the environment have to be taken under consideration via different level of data augmentation steps. To this end, we propose House3D, a rich, extensible and efficient environment that contains 45,622 human-designed 3D scenes of visually realistic houses, ranging from single-room studios to multi-storied houses, equipped with a diverse set of fully labeled 3D objects, textures and scene layouts, based on the SUNCG dataset (Song et al., 2017). The diversity in House3D opens the door towards scene-level augmentation, while the label-rich nature of House3D enables us to inject pixel- & task-level augmentations such as domain randomization (Tobin et al., 2017) and multi-task training. Using a subset of houses in House3D, we show that reinforcement learning agents trained with an enhancement of different levels of augmentations perform much better in unseen environments than our baselines with raw RGB input by over 8% in terms of navigation success rate. 
House3D is publicly available at http://github.com/facebookresearch/House3D.", "keywords": "reinforcement learning;generalization;navigation;3D scenes", "primary_area": "", "supplementary_material": "", "author": "Yi Wu;Yuxin Wu;Georgia Gkioxari;Yuandong Tian", "authorids": "jxwuyi@gmail.com;ppwwyyxxc@gmail.com;georgia.gkioxari@gmail.com;yuandong.tian@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2018building,\ntitle={Building Generalizable Agents with a Realistic and Rich 3D Environment},\nauthor={Yi Wu and Yuxin Wu and Georgia Gkioxari and Yuandong Tian},\nyear={2018},\nurl={https://openreview.net/forum?id=rkaT3zWCZ},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/House3D](https://github.com/facebookresearch/House3D) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=rkaT3zWCZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkaT3zWCZ", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.6933752452815364, "gs_citation": 279, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4231304732134350725&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "rkaqxm-0b", "title": "Neural Compositional Denotational Semantics for Question Answering", "track": "main", "status": "Reject", "tldr": "We describe an end-to-end differentiable model for QA that learns to represent spans of text in the question as denotations in knowledge graph, by learning both neural modules for composition and the syntactic structure of the sentence.", "abstract": "Answering compositional questions requiring multi-step reasoning is challenging for current models. We introduce an end-to-end differentiable model for interpreting questions, which is inspired by formal approaches to semantics. Each span of text is represented by a denotation in a knowledge graph, together with a vector that captures ungrounded aspects of meaning. Learned composition modules recursively combine constituents, culminating in a grounding for the complete sentence which is an answer to the question. For example, to interpret \u2018not green\u2019, the model will represent \u2018green\u2019 as a set of entities, \u2018not\u2019 as a trainable ungrounded vector, and then use this vector to parametrize a composition function to perform a complement operation. For each sentence, we build a parse chart subsuming all possible parses, allowing the model to jointly learn both the composition operators and output structure by gradient descent. We show the model can learn to represent a variety of challenging semantic operators, such as quantifiers, negation, disjunctions and composed relations on a synthetic question answering task. The model also generalizes well to longer sentences than seen in its training data, in contrast to LSTM and RelNet baselines. 
We will release our code.", "keywords": "question answering;knowledge graph;compositional model;semantics", "primary_area": "", "supplementary_material": "", "author": "Nitish Gupta;Mike Lewis", "authorids": "nitishg@cis.upenn.edu;mikelewis@facebook.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngupta2018neural,\ntitle={Neural Compositional Denotational Semantics for Question Answering},\nauthor={Nitish Gupta and Mike Lewis},\nyear={2018},\nurl={https://openreview.net/forum?id=rkaqxm-0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkaqxm-0b", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17413121084140207591&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Variational image compression with a scale hyperprior", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/48", "id": "rkcQFMZRb", "author_site": "Johannes Ball\u00e9, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston", "tldr": "", "abstract": "We describe an end-to-end trainable model for image compression based on variational autoencoders. The model incorporates a hyperprior to effectively capture spatial dependencies in the latent representation. This hyperprior relates to side information, a concept universal to virtually all modern image codecs, but largely unexplored in image compression using artificial neural networks (ANNs). Unlike existing autoencoder compression methods, our model trains a complex prior jointly with the underlying autoencoder. We demonstrate that this model leads to state-of-the-art image compression when measuring visual quality using the popular MS-SSIM index, and yields rate--distortion performance surpassing published ANN-based methods when evaluated using a more traditional metric based on squared error (PSNR). 
Furthermore, we provide a qualitative comparison of models trained for different distortion metrics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johannes Ball\u00e9;David Minnen;Saurabh Singh;Sung Jin Hwang;Nick Johnston", "authorids": "jballe@google.com;dminnen@google.com;saurabhsingh@google.com;sjhwang@google.com;nickj@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nball\u00e92018variational,\ntitle={Variational image compression with a scale hyperprior},\nauthor={Johannes Ball\u00e9 and David Minnen and Saurabh Singh and Sung Jin Hwang and Nick Johnston},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkcQFMZRb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 14 community implementations](https://paperswithcode.com/paper/?openreview=rkcQFMZRb)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;4;5", "rating_avg": 7.0, "confidence_avg": 4.666666666666667, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 2215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17854603515786987053&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rkcQFMZRb", "pdf": "https://openreview.net/pdf?id=rkcQFMZRb", "email": ";;;;", "author_num": 5 }, { "id": "rkc_hGb0Z", "title": "A dynamic game approach to training robust deep policies", "track": "main", "status": "Reject", "tldr": "This paper demonstrates how H-infinity control theory can help better design robust deep policies for robot motor tasks", "abstract": "We present a method for evaluating the sensitivity of deep reinforcement learning (RL) policies. We also formulate a zero-sum dynamic game for designing robust deep reinforcement learning policies. Our approach mitigates the brittleness of policies when agents are trained in a simulated environment and are later exposed to the real world where it is hazardous to employ RL policies. This framework for training deep RL policies involves a zero-sum dynamic game against an adversarial agent, where the goal is to drive the system dynamics to a saddle region. Using a variant of the guided policy search algorithm, our agent learns to adopt robust policies that require fewer samples for learning the dynamics and performs better than the GPS algorithm. 
Without loss of generality, we demonstrate that deep RL policies trained in this fashion will be maximally robust to the ``worst\" possible adversarial disturbances.", "keywords": "game-theory;reinforcement-learning;guided-policy-search;dynamic-programming", "primary_area": "", "supplementary_material": "", "author": "Olalekan Ogunmolu", "authorids": "opo140030@utdallas.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nogunmolu2018a,\ntitle={A dynamic game approach to training robust deep policies},\nauthor={Olalekan Ogunmolu},\nyear={2018},\nurl={https://openreview.net/forum?id=rkc_hGb0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkc_hGb0Z", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zboEfs4XYv4J:scholar.google.com/&scioq=A+dynamic+game+approach+to+training+robust+deep+policies&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rkcya1ZAW", "title": "Continuous-Time Flows for Efficient Inference and Density Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Two fundamental problems in unsupervised learning are efficient inference for latent-variable models and robust density estimation based on large amounts of unlabeled data. For efficient inference, normalizing flows have been recently developed to approximate a target distribution arbitrarily well. In practice, however, normalizing flows only consist of a finite number of deterministic transformations, and thus they possess no guarantee on the approximation accuracy. For density estimation, the generative adversarial network (GAN) has been advanced as an appealing model, due to its often excellent performance in generating samples. In this paper, we propose the concept of {\\em continuous-time flows} (CTFs), a family of diffusion-based methods that are able to asymptotically approach a target distribution. Distinct from normalizing flows and GANs, CTFs can be adopted to achieve the above two goals in one framework, with theoretical guarantees. Our framework includes distilling knowledge from a CTF for efficient inference, and learning an explicit energy-based distribution with CTFs for density estimation. 
Experiments on various tasks demonstrate promising performance of the proposed CTF framework, compared to related techniques.", "keywords": "continuous-time flows;efficient inference;density estimation;deep generative models", "primary_area": "", "supplementary_material": "", "author": "Changyou Chen;Chunyuan Li;Liqun Chen;Wenlin Wang;Yunchen Pu;Lawrence Carin", "authorids": "cchangyou@gmail.com;chunyuan.li@duke.edu;lc267@duke.edu;wenlin.wang@duke.edu;yunchen.pu@duke.edu;lcarin@duke.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nchen2018continuoustime,\ntitle={Continuous-Time Flows for Efficient Inference and Density Estimation},\nauthor={Changyou Chen and Chunyuan Li and Liqun Chen and Wenlin Wang and Yunchen Pu and Lawrence Carin},\nyear={2018},\nurl={https://openreview.net/forum?id=rkcya1ZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkcya1ZAW", "pdf_size": 0, "rating": "3;6;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 6, "corr_rating_confidence": 1.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17786542528210483087&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkdU7tCaZ", "title": "Dynamic Evaluation of Neural Sequence Models", "track": "main", "status": "Reject", "tldr": "Paper presents dynamic evaluation methodology for adaptive sequence modelling", "abstract": "We present methodology for using dynamic evaluation to improve neural sequence models. Models are adapted to recent history via a gradient descent based mechanism, causing them to assign higher probabilities to re-occurring sequential patterns. Dynamic evaluation outperforms existing adaptation approaches in our comparisons. 
Dynamic evaluation improves the state-of-the-art word-level perplexities on the Penn Treebank and WikiText-2 datasets to 51.1 and 44.3 respectively, and the state-of-the-art character-level cross-entropies on the text8 and Hutter Prize datasets to 1.19 bits/char and 1.08 bits/char respectively.", "keywords": "sequence modelling;language;recurrent neural networks;adaptation", "primary_area": "", "supplementary_material": "", "author": "Ben Krause;Emmanuel Kahembwe;Iain Murray;Steve Renals", "authorids": "ben.krause@ed.ac.uk;e.kahembwe@ed.ac.uk;i.murray@ed.ac.uk;s.renals@ed.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkrause2018dynamic,\ntitle={Dynamic Evaluation of Neural Sequence Models},\nauthor={Ben Krause and Emmanuel Kahembwe and Iain Murray and Steve Renals},\nyear={2018},\nurl={https://openreview.net/forum?id=rkdU7tCaZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkdU7tCaZ", "pdf_size": 0, "rating": "3;7;7", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7171182301432620931&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "rkeDJ04Mf", "title": "HyperNetworks with statistical filtering for defending adversarial examples", "track": "main", "status": "Withdraw", "tldr": "We modified the CNN using HyperNetworks and observed better robustness against adversarial examples.", "abstract": "Deep learning algorithms have been known to be vulnerable to adversarial perturbations in various tasks such as image classification. This problem was addressed by employing several defense methods for detection and rejection of particular types of attacks. However, training and manipulating networks according to particular defense schemes increases computational complexity of the learning algorithms. In this work, we propose a simple yet effective method to improve robustness of convolutional neural networks (CNNs) to adversarial attacks by using data dependent adaptive convolution kernels. To this end, we propose a new type of HyperNetwork in order to employ statistical properties of input data and features for computation of statistical adaptive maps. Then, we filter convolution weights of CNNs with the learned statistical maps to compute dynamic kernels. Thereby, weights and kernels are collectively optimized for learning of image classification models robust to\nadversarial attacks without employment of additional target detection and rejection algorithms.\nWe empirically demonstrate that the proposed method enables CNNs to spontaneously defend against different types of attacks, e.g. 
attacks generated by Gaussian noise, fast gradient sign methods (Goodfellow et al., 2014) and a black-box attack (Narodytska & Kasiviswanathan, 2016).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper293/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018hypernetworks,\n title={HyperNetworks with statistical filtering for defending adversarial examples},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=BJaKwvg0Z}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkeDJ04Mf", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15433311290764737562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rkeZRGbRW", "title": "Variance Regularizing Adversarial Learning", "track": "main", "status": "Reject", "tldr": "We introduce meta-adversarial learning, a new technique to regularize GANs, and propose a training method by explicitly controlling the discriminator's output distribution.", "abstract": "We study how, in generative adversarial networks, variance in the discriminator's output affects the generator's ability to learn the data distribution. In particular, we contrast the results from various well-known techniques for training GANs when the discriminator is near-optimal and updated multiple times per update to the generator. As an alternative, we propose an additional method to train GANs by explicitly modeling the discriminator's output as a bi-modal Gaussian distribution over the real/fake indicator variables. In order to do this, we train the Gaussian classifier to match the target bi-modal distribution implicitly through meta-adversarial training. 
We observe that our new method, when trained together with a strong discriminator, provides meaningful, non-vanishing gradients.", "keywords": "Generative Adversarial Network;Integral Probability Metric;Meta-Adversarial Learning", "primary_area": "", "supplementary_material": "", "author": "Karan Grewal;R Devon Hjelm;Yoshua Bengio", "authorids": "karanraj.grewal@mail.utoronto.ca;erroneus@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngrewal2018variance,\ntitle={Variance Regularizing Adversarial Learning},\nauthor={Karan Grewal and R Devon Hjelm and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=rkeZRGbRW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkeZRGbRW", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16924680375162979391&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Memory-based Parameter Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/60", "id": "rkfOvGbCW", "author_site": "Pablo Sprechmann, Siddhant Jayakumar, Jack Rae, Alexander Pritzel, Adria Puigdomenech Badia, Benigno Uria, Oriol Vinyals, Demis Hassabis, Razvan Pascanu, Charles Blundell", "tldr": "", "abstract": " Deep neural networks have excelled on a wide range of problems, from vision to language and game playing. Neural networks very gradually incorporate information into weights as they process data, requiring very low learning rates. If the training distribution shifts, the network is slow to adapt, and when it does adapt, it typically performs badly on the training distribution before the shift. Our method, Memory-based Parameter Adaptation, stores examples in memory and then uses a context-based lookup to directly modify the weights of a neural network. Much higher learning rates can be used for this local adaptation, reneging the need for many iterations over similar data before good predictions can be made. As our method is memory-based, it alleviates several shortcomings of neural networks, such as catastrophic forgetting, fast, stable acquisition of new knowledge, learning with an imbalanced class labels, and fast learning during evaluation. We demonstrate this on a range of supervised tasks: large-scale image classification and language modelling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pablo Sprechmann;Siddhant M. Jayakumar;Jack W. 
Rae;Alexander Pritzel;Adria Puigdomenech Badia;Benigno Uria;Oriol Vinyals;Demis Hassabis;Razvan Pascanu;Charles Blundell", "authorids": "psprechmann@google.com;sidmj@google.com;jwrae@google.com;apritzel@google.com;adriap@google.com;buria@google.com;vinyals@google.com;dhcontact@google.com;razp@google.com;cblundell@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nsprechmann2018memorybased,\ntitle={Memory-based Parameter Adaptation},\nauthor={Pablo Sprechmann and Siddhant Jayakumar and Jack Rae and Alexander Pritzel and Adria Puigdomenech Badia and Benigno Uria and Oriol Vinyals and Demis Hassabis and Razvan Pascanu and Charles Blundell},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkfOvGbCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 19, "authors#_avg": 10, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13158888045275274984&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkfOvGbCW", "pdf": "https://openreview.net/pdf?id=rkfOvGbCW", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "rkfbLilAb", "title": "Improving Search Through A3C Reinforcement Learning Based Conversational Agent", "track": "main", "status": "Reject", "tldr": "A Reinforcement Learning based conversational search assistant which provides contextual assistance in subjective search (like digital assets).", "abstract": "We develop a reinforcement learning based search assistant which can assist users through a set of actions and sequence of interactions to enable them realize their intent. Our approach caters to subjective search where the user is seeking digital assets such as images which is fundamentally different from the tasks which have objective and limited search modalities. Labeled conversational data is generally not available in such search tasks and training the agent through human interactions can be time consuming. We propose a stochastic virtual user which impersonates a real user and can be used to sample user behavior efficiently to train the agent which accelerates the bootstrapping of the agent. We develop A3C algorithm based context preserving architecture which enables the agent to provide contextual assistance to the user. We compare the A3C agent with Q-learning and evaluate its performance on average rewards and state values it obtains with the virtual user in validation episodes. 
Our experiments show that the agent learns to achieve higher rewards and better states.", "keywords": "Subjective search;Reinforcement Learning;Conversational Agent;Virtual user model;A3C;Context aggregation", "primary_area": "", "supplementary_material": "", "author": "Milan Aggarwal;Aarushi Arora;Shagun Sodhani;Balaji Krishnamurthy", "authorids": "milan.ag1994@gmail.com;aarushi.arora043@gmail.com;sshagunsodhani@gmail.com;kbalaji@adobe.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\naggarwal2018improving,\ntitle={Improving Search Through A3C Reinforcement Learning Based Conversational Agent},\nauthor={Milan Aggarwal and Aarushi Arora and Shagun Sodhani and Balaji Krishnamurthy},\nyear={2018},\nurl={https://openreview.net/forum?id=rkfbLilAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkfbLilAb", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.944911182523068, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7579008030519096125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Neural Language Modeling by Jointly Learning Syntax and Lexicon", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/124", "id": "rkgOLb-0W", "author_site": "Yikang Shen, Zhouhan Lin, Chin-Wei Huang, Aaron Courville", "tldr": "In this paper, We propose a novel neural language model, called the Parsing-Reading-Predict Networks (PRPN), that can simultaneously induce the syntactic structure from unannotated sentences and leverage the inferred structure to learn a better language model.", "abstract": "We propose a neural language model capable of unsupervised syntactic structure induction. The model leverages the structure information to form better semantic representations and better language modeling. Standard recurrent neural networks are limited by their structure and fail to efficiently use syntactic information. On the other hand, tree-structured recursive networks usually require additional structural supervision at the cost of human expert annotation. In this paper, We propose a novel neural language model, called the Parsing-Reading-Predict Networks (PRPN), that can simultaneously induce the syntactic structure from unannotated sentences and leverage the inferred structure to learn a better language model. In our model, the gradient can be directly back-propagated from the language model loss into the neural parsing network. 
Experiments show that the proposed model can discover the underlying syntactic structure and achieve state-of-the-art performance on word/character-level language model tasks.", "keywords": "Language model;unsupervised parsing", "primary_area": "", "supplementary_material": "", "author": "Yikang Shen;Zhouhan Lin;Chin-wei Huang;Aaron Courville", "authorids": "yikang.shn@gmail.com;lin.zhouhan@gmail.com;chin-wei.huang@umontreal.ca;aaron.courville@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nshen2018neural,\ntitle={Neural Language Modeling by Jointly Learning Syntax and Lexicon},\nauthor={Yikang Shen and Zhouhan Lin and Chin-wei Huang and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkgOLb-0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 208, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15172031143397580163&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rkgOLb-0W", "pdf": "https://openreview.net/pdf?id=rkgOLb-0W", "email": ";;;", "author_num": 4 }, { "id": "rkh-agjMG", "title": "Learning to Imagine Manipulation Goals for Robot Task Planning", "track": "main", "status": "Withdraw", "tldr": "We describe an architecture for generating diverse hypotheses for intermediate goals during robotic manipulation tasks.", "abstract": "Prospection is an important part of how humans come up with new task plans, but has not been explored in depth in robotics. Predicting multiple task-level is a challenging problem that involves capturing both task semantics and continuous variability over the state of the world. Ideally, we would combine the ability of machine learning to leverage big data for learning the semantics of a task, while using techniques from task planning to reliably generalize to new environment. In this work, we propose a method for learning a model encoding just such a representation for task planning. We learn a neural net that encodes the k most likely outcomes from high level actions from a given world. Our approach creates comprehensible task plans that allow us to predict changes to the environment many time steps into the future. We demonstrate this approach via application to a stacking task in a cluttered environment, where the robot must select between different colored blocks while avoiding obstacles, in order to perform a task. We also show results on a simple navigation task. 
Our algorithm generates realistic image and pose predictions at multiple points in a given task.\n", "keywords": "deep learning;planning;prediction;generative models", "primary_area": "", "supplementary_material": "", "author": "Chris Paxton;Kapil Katyal;Christian Rupprecht;Raman Arora;Gregory D Hager", "authorids": "cpaxton@jhu.edu;kkatyal2@jhu.edu;christian.rupprecht@in.tum.de;arora@cs.jhu.edu;hager@cs.jhu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkh-agjMG", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17607927799843097302&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkhCSO4T-", "title": "Distributed non-parametric deep and wide networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent work, it was shown that combining multi-kernel based support vector machines (SVMs) can lead to near state-of-the-art performance on an action recognition dataset (HMDB-51 dataset). In the present work, we show that combining distributed Gaussian Processes with multi-stream deep convolutional neural networks (CNN) alleviate the need to augment a neural network with hand-crafted features. In contrast to prior work, we treat each deep neural convolutional network as an expert wherein the individual predictions (and their respective uncertainties) are combined into a Product of Experts (PoE) framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Biswa Sengupta;Yu Qian", "authorids": "biswasengupta@yahoo.com;yu.qian@cortexica.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsengupta2018distributed,\ntitle={Distributed non-parametric deep and wide networks},\nauthor={Biswa Sengupta and Yu Qian},\nyear={2018},\nurl={https://openreview.net/forum?id=rkhCSO4T-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=rkhCSO4T-", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:i1HYRfaTzhoJ:scholar.google.com/&scioq=Distributed+non-parametric+deep+and+wide+networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Wavelet Pooling for Convolutional Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/262", "id": "rkhlb8lCZ", "author_site": "Travis Williams, Robert Li", "tldr": "Pooling is achieved using wavelets instead of traditional neighborhood approaches (max, average, etc).", "abstract": "Convolutional Neural Networks continuously advance the progress of 2D and 3D image and object classification. The steadfast usage of this algorithm requires constant evaluation and upgrading of foundational concepts to maintain progress. 
Network regularization techniques typically focus on convolutional layer operations, while leaving pooling layer operations without suitable options. We introduce Wavelet Pooling as another alternative to traditional neighborhood pooling. This method decomposes features into a second level decomposition, and discards the first-level subbands to reduce feature dimensions. This method addresses the overfitting problem encountered by max pooling, while reducing features in a more structurally compact manner than pooling via neighborhood regions. Experimental results on four benchmark classification datasets demonstrate our proposed method outperforms or performs comparatively with methods like max, mean, mixed, and stochastic pooling. ", "keywords": "Pooling;Wavelet;CNN;Neural Network;Deep Learning;Classification;Machine Learning;Object Recognition", "primary_area": "", "supplementary_material": "", "author": "Travis Williams;Robert Li", "authorids": "tlwilli3@aggies.ncat.edu;eeli@ncat.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwilliams2018wavelet,\ntitle={Wavelet Pooling for Convolutional Neural Networks},\nauthor={Travis Williams and Robert Li},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkhlb8lCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;7;9", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.8029550685469661, "gs_citation": 285, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14408611586003021484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rkhlb8lCZ", "pdf": "https://openreview.net/pdf?id=rkhlb8lCZ", "email": ";", "author_num": 2 }, { "id": "rkhxwltab", "title": "AANN: Absolute Artificial Neural Network", "track": "main", "status": "Reject", "tldr": "Tied weights auto-encoder with abs function as activation function, learns to do classification in the forward direction and regression in the backward direction due to specially defined cost function.", "abstract": "This research paper describes a simplistic architecture named as AANN: Absolute Artificial Neural Network, which can be used to create highly interpretable representations of the input data. These representations are generated by penalizing the learning of the network in such a way that those learned representations correspond to the respective labels present in the labelled dataset used for supervised training; thereby, simultaneously giving the network the ability to classify the input data. The network can be used in the reverse direction to generate data that closely resembles the input by feeding in representation vectors as required. This research paper also explores the use of mathematical abs (absolute valued) functions as activation functions which constitutes the core part of this neural network architecture. 
Finally the results obtained on the MNIST dataset by using this technique are presented and discussed in brief.", "keywords": "Neural Network architecture;Learned representation space;absolute valued function;bidirectional neuron", "primary_area": "", "supplementary_material": "", "author": "Animesh Karnewar", "authorids": "animeshsk3@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nkarnewar2018aann,\ntitle={{AANN}: Absolute Artificial Neural Network},\nauthor={Animesh Karnewar},\nyear={2018},\nurl={https://openreview.net/forum?id=rkhxwltab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkhxwltab", "pdf_size": 0, "rating": "2;3;6", "confidence": "3;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 1, "corr_rating_confidence": 0.24019223070763066, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3248480733316460559&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rkmoiMbCb", "title": "Tandem Blocks in Deep Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We generalize residual blocks to tandem blocks, which use arbitrary linear maps instead of shortcuts, and improve performance over ResNets.", "abstract": "Due to the success of residual networks (resnets) and related architectures, shortcut connections have quickly become standard tools for building convolutional neural networks. The explanations in the literature for the apparent effectiveness of shortcuts are varied and often contradictory. We hypothesize that shortcuts work primarily because they act as linear counterparts to nonlinear layers. We test this hypothesis by using several variations on the standard residual block, with different types of linear connections, to build small (100k--1.2M parameter) image classification networks. Our experiments show that other kinds of linear connections can be even more effective than the identity shortcuts. 
Our results also suggest that the best type of linear connection for a given application may depend on both network width and depth.", "keywords": "resnet;residual;shortcut;convolutional;linear;skip;highway", "primary_area": "", "supplementary_material": "", "author": "Chris Hettinger;Tanner Christensen;Jeff Humpherys;Tyler J Jarvis", "authorids": "chrishettinger@gmail.com;tkchristensen@byu.edu;jeffh@math.byu.edu;jarvis@math.byu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhettinger2018tandem,\ntitle={Tandem Blocks in Deep Convolutional Neural Networks},\nauthor={Chris Hettinger and Tanner Christensen and Jeff Humpherys and Tyler J Jarvis},\nyear={2018},\nurl={https://openreview.net/forum?id=rkmoiMbCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkmoiMbCb", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11631495711340790648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkmtTJZCb", "title": "Unsupervised Hierarchical Video Prediction", "track": "main", "status": "Reject", "tldr": "We show ways to train a hierarchical video prediction model without needing pose labels.", "abstract": "Much recent research has been devoted to video prediction and generation, but mostly for short-scale time horizons. The hierarchical video prediction method by Villegas et al. (2017) is an example of a state of the art method for long term video prediction. However, their method has limited applicability in practical settings as it requires a ground truth pose (e.g., poses of joints of a human) at training time. This paper presents a long term hierarchical video prediction model that does not have such a restriction. We show that the network learns its own higher-level structure (e.g., pose equivalent hidden variables) that works better in cases where the ground truth pose does not fully capture all of the information needed to predict the next frame. 
This method gives sharper results than other video prediction methods which do not require a ground truth pose, and its efficiency is shown on the Humans 3.6M and Robot Pushing datasets.", "keywords": "video prediction;visual analogy network;unsupervised;hierarchical", "primary_area": "", "supplementary_material": "", "author": "Nevan Wichers;Dumitru Erhan;Honglak Lee", "authorids": "wichersn@google.com;dumitru@google.com;honglak@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwichers2018unsupervised,\ntitle={Unsupervised Hierarchical Video Prediction},\nauthor={Nevan Wichers and Dumitru Erhan and Honglak Lee},\nyear={2018},\nurl={https://openreview.net/forum?id=rkmtTJZCb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkmtTJZCb", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "MGAN: Training Generative Adversarial Nets with Multiple Generators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/318", "id": "rkmu5b0a-", "author_site": "Quan Hoang, Tu D Nguyen, Trung Le, Dinh Phung", "tldr": "We propose a new approach to train GANs with a mixture of generators to overcome the mode collapsing problem.", "abstract": "We propose in this paper a new approach to train the Generative Adversarial Nets (GANs) with a mixture of generators to overcome the mode collapsing problem. The main intuition is to employ multiple generators, instead of using a single one as in the original GAN. The idea is simple, yet proven to be extremely effective at covering diverse data modes, easily overcoming the mode collapsing problem and delivering state-of-the-art results. A minimax formulation was able to establish among a classifier, a discriminator, and a set of generators in a similar spirit with GAN. Generators create samples that are intended to come from the same distribution as the training data, whilst the discriminator determines whether samples are true data or generated by generators, and the classifier specifies which generator a sample comes from. The distinguishing feature is that internal samples are created from multiple generators, and then one of them will be randomly selected as final output similar to the mechanism of a probabilistic mixture model. We term our method Mixture Generative Adversarial Nets (MGAN). We develop theoretical analysis to prove that, at the equilibrium, the Jensen-Shannon divergence (JSD) between the mixture of generators\u2019 distributions and the empirical data distribution is minimal, whilst the JSD among generators\u2019 distributions is maximal, hence effectively avoiding the mode collapsing problem. By utilizing parameter sharing, our proposed model adds minimal computational cost to the standard GAN, and thus can also efficiently scale to large-scale datasets. 
We conduct extensive experiments on synthetic 2D data and natural image databases (CIFAR-10, STL-10 and ImageNet) to demonstrate the superior performance of our MGAN in achieving state-of-the-art Inception scores over latest baselines, generating diverse and appealing recognizable objects at different resolutions, and specializing in capturing different types of objects by the generators.", "keywords": "GANs;Mode Collapse;Mixture;Jensen-Shannon Divergence;Inception Score;Generator;Discriminator;CIFAR-10;STL-10;ImageNet", "primary_area": "", "supplementary_material": "", "author": "Quan Hoang;Tu Dinh Nguyen;Trung Le;Dinh Phung", "authorids": "qhoang@umass.edu;tu.nguyen@deakin.edu.au;trung.l@deakin.edu.au;dinh.phung@deakin.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhoang2018mgan,\ntitle={{MGAN}: Training Generative Adversarial Nets with Multiple Generators},\nauthor={Quan Hoang and Tu Dinh Nguyen and Trung Le and Dinh Phung},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkmu5b0a-},\n}", "github": "[![github](/images/github_icon.svg) qhoangdl/MGAN](https://github.com/qhoangdl/MGAN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 29, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 287, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15083973924521420990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rkmu5b0a-", "pdf": "https://openreview.net/pdf?id=rkmu5b0a-", "email": ";;;", "author_num": 4 }, { "id": "rknJHfXBz", "title": "Empirical Investigation on Model Capacity and Generalization of Neural Networks for Text", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recently, deep neural network models have shown promising opportunities for many natural language processing (NLP) tasks. In practice, the number of parameters of deep neural models is often significantly larger than the size of the training set, and its generalization behavior cannot be explained by the classic generalization theory. In this paper, with extensive experiments, we empirically investigate the model capacity and generalization of neural models for text. The experiments show that deep neural models can find patterns better than brute-force memorization. 
Therefore, a large-capacity model with early-stopping stochastic gradient descent (SGD) as implicit regularizer seems to be the best choice, as it has better generalization ability and higher convergence speed.", "keywords": "Text;Empirical Investigation;Model Capacity;Generalization Ability;Neural Networks;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper265/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018empirical,\n title={Empirical Investigation on Model Capacity and Generalization of Neural Networks for Text},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=Hyz66BxCW}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rknJHfXBz", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": 0.9999999999999997, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Compositional Obverter Communication Learning from Raw Visual Input", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/265", "id": "rknt2Be0-", "author_site": "Edward Choi, Angeliki Lazaridou, Nando de Freitas", "tldr": "We train neural network agents to develop a language with compositional properties from raw pixel input.", "abstract": "One of the distinguishing aspects of human language is its compositionality, which allows us to describe complex environments with limited vocabulary. Previously, it has been shown that neural network agents can learn to communicate in a highly structured, possibly compositional language based on disentangled input (e.g. hand- engineered features). Humans, however, do not learn to communicate based on well-summarized features. In this work, we train neural agents to simultaneously develop visual perception from raw image pixels, and learn to communicate with a sequence of discrete symbols. The agents play an image description game where the image contains factors such as colors and shapes. We train the agents using the obverter technique where an agent introspects to generate messages that maximize its own understanding. 
Through qualitative analysis, visualization and a zero-shot test, we show that the agents can develop, out of raw image pixels, a language with compositional properties, given a proper pressure from the environment.", "keywords": "compositional language;obverter;multi-agent communication;raw pixel input", "primary_area": "", "supplementary_material": "", "author": "Edward Choi;Angeliki Lazaridou;Nando de Freitas", "authorids": "mp2893@gatech.edu;angeliki@google.com;nandodefreitas@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchoi2018multiagent,\ntitle={Multi-Agent Compositional Communication Learning from Raw Visual Input},\nauthor={Edward Choi and Angeliki Lazaridou and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rknt2Be0-},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rknt2Be0-)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "3;6;9", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11340269692127577649&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rknt2Be0-", "pdf": "https://openreview.net/pdf?id=rknt2Be0-", "email": ";;", "author_num": 3 }, { "id": "rko6pIcRZ", "title": "The Multilinear Structure of ReLU Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the loss surface of neural networks that involve only rectified linear unit (ReLU) nonlinearities from a theoretical point-of-view. Any such network defines a piecewise multilinear form in parameter space. As a consequence, optima of such networks generically occur in non-differentiable regions of parameter space and so any understanding of such networks must carefully take into account their non-smooth nature. We then proceed to leverage this multilinear structure in an analysis of a neural network with one hidden-layer. 
Under the assumption of linearly separable data, the piecewise bilinear structure of the loss allows us to provide an explicit description of all critical points.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Laurent;James von Brecht", "authorids": "tlaurent@lmu.edu;james.vonbrecht@csulb.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=rko6pIcRZ", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 2, "corr_rating_confidence": 0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9309445801242373392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Self-ensembling for visual domain adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/207", "id": "rkpoTaxA-", "author_site": "Geoff W French, Michal Mackiewicz, Mark Fisher", "tldr": "Self-ensembling based algorithm for visual domain adaptation, state of the art results, won VisDA-2017 image classification domain adaptation challenge.", "abstract": "This paper explores the use of self-ensembling for visual domain adaptation problems. Our technique is derived from the mean teacher variant (Tarvainen et. al 2017) of temporal ensembling (Laine et al. 2017), a technique that achieved state of the art results in the area of semi-supervised learning. We introduce a number of modifications to their approach for challenging domain adaptation scenarios and evaluate its effectiveness. Our approach achieves state of the art results in a variety of benchmarks, including our winning entry in the VISDA-2017 visual domain adaptation challenge. 
In small image benchmarks, our algorithm not only outperforms prior art, but can also achieve accuracy that is close to that of a classifier trained in a supervised fashion.", "keywords": "deep learning;neural networks;domain adaptation;images;visual;computer vision", "primary_area": "", "supplementary_material": "", "author": "Geoff French;Michal Mackiewicz;Mark Fisher", "authorids": "g.french@uea.ac.uk;m.mackiewicz@uea.ac.uk;m.fisher@uea.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfrench2018selfensembling,\ntitle={Self-ensembling for visual domain adaptation},\nauthor={Geoff French and Michal Mackiewicz and Mark Fisher},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkpoTaxA-},\n}", "github": "[![github](/images/github_icon.svg) Britefury/self-ensemble-visual-domain-adapt](https://github.com/Britefury/self-ensemble-visual-domain-adapt) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkpoTaxA-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;5;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 690, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9203351470159334271&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17, "openreview": "https://openreview.net/forum?id=rkpoTaxA-", "pdf": "https://openreview.net/pdf?id=rkpoTaxA-", "email": ";;", "author_num": 3 }, { "id": "rkpqdGDeM", "title": "Sparse Deep Scattering Crois\u00e9 Network", "track": "main", "status": "Withdraw", "tldr": "We propose to enhance the Deep Scattering Network in order to improve control and stability of any given machine learning pipeline by proposing a continuous wavelet thresholding scheme", "abstract": "In this work, we propose the Sparse Deep Scattering Crois\u00e9 Network (SDCSN) a novel architecture based on the Deep Scattering Network (DSN). The DSN is achieved by cascading wavelet transform convolutions with a complex modulus and a time-invariant operator. We extend this work by first,\ncrossing multiple wavelet family transforms to increase the feature diversity while avoiding any learning. Thus providing a more informative latent representation and benefit from the development of highly specialized wavelet filters over the last decades. Beside, by combining all the different wavelet representations, we reduce the amount of prior information needed regarding the signals at hand.\nSecondly, we develop an optimal thresholding strategy for over-complete filter banks that regularizes the network and controls instabilities such as inherent non-stationary noise in the signal. Our systematic and principled solution sparsifies the latent representation of the network by acting as a local mask distinguishing between activity and noise. 
Thus, we propose to enhance the DSN by increasing the variance of the scattering coefficients representation as well as improve its robustness with respect to non-stationary noise.\nWe show that our new approach is more robust and outperforms the DSN on a bird detection task.", "keywords": "Deep Scattering Network;Continuous Wavelet Thresholding;Sparse Activations;Time-frequency represenation;Multi-Family;Wavelets;Convolutional Network;Bird Detection", "primary_area": "", "supplementary_material": "", "author": "Romain Cosentino;Randall Balestriero;Richard Baraniuk;Ankit Patel", "authorids": "rom.cosentino@gmail.com;randallbalestriero@gmail.com;ankitpatel715@gmail.com;baraniuk@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2", "site": "https://openreview.net/forum?id=rkpqdGDeM", "pdf_size": 0, "rating": "6", "confidence": "4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 1, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fGvtLqd7eYEJ:scholar.google.com/&scioq=Sparse+Deep+Scattering+Crois%C3%A9+Network&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "title": "Large scale distributed neural network training through online distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/255", "id": "rkr1UDeC-", "author_site": "Rohan Anil, Gabriel Pereyra, Alexandre Tachard Passos, Robert Ormandi, George Dahl, Geoffrey E Hinton", "tldr": "We perform large scale experiments to show that a simple online variant of distillation can help us scale distributed neural network training to more machines.", "abstract": "Techniques such as ensembling and distillation promise model quality improvements when paired with almost any base model. However, due to increased test-time cost (for ensembles) and increased complexity of the training pipeline (for distillation), these techniques are challenging to use in industrial settings. In this paper we explore a variant of distillation which is relatively straightforward to use as it does not require a complicated multi-stage setup or many new hyperparameters. Our first claim is that online distillation enables us to use extra parallelism to fit very large datasets about twice as fast. Crucially, we can still speed up training even after we have already reached the point at which additional parallelism provides no benefit for synchronous or asynchronous stochastic gradient descent. Two neural networks trained on disjoint subsets of the data can share knowledge by encouraging each model to agree with the predictions the other model would have made. These predictions can come from a stale version of the other model so they can be safely computed using weights that only rarely get transmitted. Our second claim is that online distillation is a cost-effective way to make the exact predictions of a model dramatically more reproducible. 
We support our claims using experiments on the Criteo Display Ad Challenge dataset, ImageNet, and the largest to-date dataset used for neural language modeling, containing $6\\times 10^{11}$ tokens and based on the Common Crawl repository of web data.", "keywords": "distillation;distributed training;neural networks;deep learning", "primary_area": "", "supplementary_material": "", "author": "Rohan Anil;Gabriel Pereyra;Alexandre Passos;Robert Ormandi;George E. Dahl;Geoffrey E. Hinton", "authorids": "rohananil@google.com;pereyra@google.com;apassos@google.com;ormandi@google.com;gdahl@google.com;geoffhinton@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nanil2018large,\ntitle={Large scale distributed neural network training through online distillation},\nauthor={Rohan Anil and Gabriel Pereyra and Alexandre Passos and Robert Ormandi and George E. Dahl and Geoffrey E. Hinton},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkr1UDeC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;8", "confidence": "3;3;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 535, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1698767877858492764&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=rkr1UDeC-", "pdf": "https://openreview.net/pdf?id=rkr1UDeC-", "email": ";;;;;", "author_num": 6 }, { "title": "Learning a Generative Model for Validity in Complex Discrete Structures", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/29", "id": "rkrC3GbRW", "author_site": "David Janz, Jos van der Westhuizen, Brooks Paige, Matt J Kusner, Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "tldr": "", "abstract": "Deep generative models have been successfully used to learn representations for high-dimensional discrete spaces by representing discrete objects as sequences and employing powerful sequence-based deep models. Unfortunately, these sequence-based models often produce invalid sequences: sequences which do not represent any underlying discrete structure; invalid sequences hinder the utility of such models. As a step towards solving this problem, we propose to learn a deep recurrent validator model, which can estimate whether a partial sequence can function as the beginning of a full, valid sequence. This validator provides insight as to how individual sequence elements influence the validity of the overall sequence, and can be used to constrain sequence based models to generate valid sequences \u2014 and thus faithfully model discrete objects. Our approach is inspired by reinforcement learning, where an oracle which can evaluate validity of complete sequences provides a sparse reward signal. 
We demonstrate its effectiveness as a generative model of Python 3 source code for mathematical expressions, and in improving the ability of a variational autoencoder trained on SMILES strings to decode valid molecular structures.", "keywords": "Active learning;Reinforcement learning;Molecules", "primary_area": "", "supplementary_material": "", "author": "Dave Janz;Jos van der Westhuizen;Brooks Paige;Matt Kusner;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "david.janz93@gmail.com;josvdwest@gmail.com;tbpaige@gmail.com;matt.kusner@gmail.com;jmh233@cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\njanz2018learning,\ntitle={Learning a Generative Model for Validity in Complex Discrete Structures},\nauthor={Dave Janz and Jos van der Westhuizen and Brooks Paige and Matt Kusner and Jose Miguel Hernandez Lobato},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rkrC3GbRW},\n}", "github": "[![github](/images/github_icon.svg) DavidJanz/molecule_grammar_rnn](https://github.com/DavidJanz/molecule_grammar_rnn)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5246820158519363051&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkrC3GbRW", "pdf": "https://openreview.net/pdf?id=rkrC3GbRW", "email": ";;;;", "author_num": 5 }, { "id": "rkrWCJWAW", "title": "Unbiasing Truncated Backpropagation Through Time", "track": "main", "status": "Reject", "tldr": "Provides an unbiased version of truncated backpropagation by sampling truncation lengths and reweighting accordingly.", "abstract": "\\emph{Truncated Backpropagation Through Time} (truncated BPTT, \\cite{jaeger2002tutorial}) is a widespread method for learning recurrent computational graphs. Truncated BPTT keeps the computational benefits of \\emph{Backpropagation Through Time} (BPTT \\cite{werbos:bptt}) while relieving the need for a complete backtrack through the whole data sequence at every step. However, truncation favors short-term dependencies: the gradient estimate of truncated BPTT is biased, so that it does not benefit from the convergence guarantees from stochastic gradient theory. We introduce \\emph{Anticipated Reweighted Truncated Backpropagation} (ARTBP), an algorithm that keeps the computational benefits of truncated BPTT, while providing unbiasedness. ARTBP works by using variable truncation lengths together with carefully chosen compensation factors in the backpropagation equation. We check the viability of ARTBP on two tasks. First, a simple synthetic task where careful balancing of temporal dependencies at different scales is needed: truncated BPTT displays unreliable performance, and in worst case scenarios, divergence, while ARTBP converges reliably. 
Second, on Penn Treebank character-level language modelling \\cite{ptb_proc}, ARTBP slightly outperforms truncated BPTT.\n", "keywords": "RNN", "primary_area": "", "supplementary_material": "", "author": "Corentin Tallec;Yann Ollivier", "authorids": "corentin.tallec@polytechnique.edu;yann@yann-ollivier.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntallec2018unbiasing,\ntitle={Unbiasing Truncated Backpropagation Through Time},\nauthor={Corentin Tallec and Yann Ollivier},\nyear={2018},\nurl={https://openreview.net/forum?id=rkrWCJWAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkrWCJWAW", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13571113438570907766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkrqgCWMG", "title": "Withdraw", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "withdrawn paper", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Liyuan Liu;Jingbo Shang;Xiaotao Gu;Xiang Ren;Jian Peng;Jiawei Han", "authorids": "ll2@illinois.edu;shang7@illinois.edu;xiaotao2@illinois.du;xiangren@usc.edu;jianpeng@illinois.edu;hanj@illinois.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkrqgCWMG", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;3", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "rkvDssyRb", "title": "Multi-Advisor Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We consider tackling a single-agent RL problem by distributing it to $n$ learners.", "abstract": "We consider tackling a single-agent RL problem by distributing it to $n$ learners. These learners, called advisors, endeavour to solve the problem from a different focus. Their advice, taking the form of action values, is then communicated to an aggregator, which is in control of the system. We show that the local planning method for the advisors is critical and that none of the ones found in the literature is flawless: the \\textit{egocentric} planning overestimates values of states where the other advisors disagree, and the \\textit{agnostic} planning is inefficient around danger zones. We introduce a novel approach called \\textit{empathic} and discuss its theoretical aspects. 
We empirically examine and validate our theoretical findings on a fruit collection task.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Romain Laroche;Mehdi Fatemi;Joshua Romoff;Harm van Seijen", "authorids": "romain.laroche@gmail.com;mehdi.fatemi@microsoft.com;joshua.romoff@mail.mcgill.ca;havansei@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlaroche2018multiadvisor,\ntitle={Multi-Advisor Reinforcement Learning},\nauthor={Romain Laroche and Mehdi Fatemi and Joshua Romoff and Harm van Seijen},\nyear={2018},\nurl={https://openreview.net/forum?id=rkvDssyRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkvDssyRb", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;5", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13658155835203023834&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkw-jlb0W", "title": "Deep Lipschitz networks and Dudley GANs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial networks (GANs) have enjoyed great success, however often suffer instability during training which motivates many attempts to resolve this issue. Theoretical explanation for the cause of instability is provided in Wasserstein GAN (WGAN), and wasserstein distance is proposed to stablize the training. Though WGAN is indeed more stable than previous GANs, it takes much more iterations and time to train. This is because the ways to ensure Lipschitz condition in WGAN (such as weight-clipping) significantly limit the capacity of the network. In this paper, we argue that it is beneficial to ensure Lipschitz condition as well as maintain sufficient capacity and expressiveness of the network. To facilitate this, we develop both theoretical and practical building blocks, using which one can construct different neural networks using a large range of metrics, as well as ensure Lipschitz condition and sufficient capacity of the networks. Using the proposed building blocks, and a special choice of a metric called Dudley metric, we propose Dudley GAN that outperforms the state of the arts in both convergence and sample quality. 
We discover a natural link between Dudley GAN (and its extension) and empirical risk minimization, which gives rise to generalization analysis.", "keywords": "GAN;Lipschitz neural network", "primary_area": "", "supplementary_material": "", "author": "Ehsan Abbasnejad;Javen Shi;Anton van den Hengel", "authorids": "ehsan.abbasnejad@adelaide.edu.au;javen.shi@adelaide.edu.au;anton.vandenhengel@adelaide.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nabbasnejad2018deep,\ntitle={Deep Lipschitz networks and Dudley {GAN}s},\nauthor={Ehsan Abbasnejad and Javen Shi and Anton van den Hengel},\nyear={2018},\nurl={https://openreview.net/forum?id=rkw-jlb0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkw-jlb0W", "pdf_size": 0, "rating": "5;5;8", "confidence": "3;1;4", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184546, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18235088566137937627&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rkxY-sl0W", "title": "Tree-to-tree Neural Networks for Program Translation", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Program translation is an important tool to migrate legacy code in one language into an ecosystem built in a different language. In this work, we are the first to consider employing deep neural networks toward tackling this problem. We observe that program translation is a modular procedure, in which a sub-tree of the source tree is translated into the corresponding target sub-tree at each step. To capture this intuition, we design a tree-to-tree neural network as an encoder-decoder architecture to translate a source tree into a target one. Meanwhile, we develop an attention mechanism for the tree-to-tree model, so that when the decoder expands one non-terminal in the target tree, the attention mechanism locates the corresponding sub-tree in the source tree to guide the expansion of the decoder. We evaluate the program translation capability of our tree-to-tree model against several state-of-the-art approaches. Compared against other neural translation models, we observe that our approach is consistently better than the baselines with a margin of up to 15 points. 
Further, our approach can improve the previous state-of-the-art program translation approaches by a margin of 20 points on the translation of real-world projects.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Chang Liu;Dawn Song", "authorids": "xinyun.chen@berkeley.edu;liuchang@eecs.berkeley.edu;dawnsong.travel@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2018treetotree,\ntitle={Tree-to-tree Neural Networks for Program Translation},\nauthor={Xinyun Chen and Chang Liu and Dawn Song},\nyear={2018},\nurl={https://openreview.net/forum?id=rkxY-sl0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxY-sl0W", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 342, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5077386959127255183&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8 }, { "title": "Variational Network Quantization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/131", "id": "ry-TW-WAb", "author_site": "Jan Achterhold, Jan Koehler, Anke Schmeink, Tim Genewein", "tldr": "We quantize and prune neural network weights using variational Bayesian inference with a multi-modal, sparsity inducing prior.", "abstract": "In this paper, the preparation of a neural network for pruning and few-bit quantization is formulated as a variational inference problem. To this end, a quantizing prior that leads to a multi-modal, sparse posterior distribution over weights, is introduced and a differentiable Kullback-Leibler divergence approximation for this prior is derived. After training with Variational Network Quantization, weights can be replaced by deterministic quantization values with small to negligible loss of task accuracy (including pruning by setting weights to 0). The method does not require fine-tuning after quantization. 
Results are shown for ternary quantization on LeNet-5 (MNIST) and DenseNet (CIFAR-10).", "keywords": "Network compression;variational inferene;ternary network;Bayesian neural network;weight quantization;weight sharing", "primary_area": "", "supplementary_material": "", "author": "Jan Achterhold;Jan Mathias Koehler;Anke Schmeink;Tim Genewein", "authorids": "mail@janachterhold.de;jan.koehler@de.bosch.com;anke.schmeink@rwth-aachen.de;tim.genewein@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nachterhold2018variational,\ntitle={Variational Network Quantization},\nauthor={Jan Achterhold and Jan Mathias Koehler and Anke Schmeink and Tim Genewein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry-TW-WAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;5", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6940038466399805898&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ry-TW-WAb", "pdf": "https://openreview.net/pdf?id=ry-TW-WAb", "email": ";;;", "author_num": 4 }, { "title": "Deep Active Learning for Named Entity Recognition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/125", "id": "ry018WZAZ", "author_site": "Yanyao Shen, Hyokun Yun, Zachary Lipton, Yakov Kronrod, anima anandkumar", "tldr": "We introduce a lightweight architecture for named entity recognition and carry out incremental active learning, which is able to match state-of-the-art performance with just 25% of the original training data.", "abstract": "Deep learning has yielded state-of-the-art performance on many natural language processing tasks including named entity recognition (NER). However, this typically requires large amounts of labeled data. In this work, we demonstrate that the amount of labeled training data can be drastically reduced when deep learning is combined with active learning. While active learning is sample-efficient, it can be computationally expensive since it requires iterative retraining. To speed this up, we introduce a lightweight architecture for NER, viz., the CNN-CNN-LSTM model consisting of convolutional character and word encoders and a long short term memory (LSTM) tag decoder. The model achieves nearly state-of-the-art performance on standard datasets for the task while being computationally much more efficient than best performing models. We carry out incremental active learning, during the training process, and are able to nearly match state-of-the-art performance with just 25\\% of the original training data.", "keywords": "active learning;deep learning;named entity recognition", "primary_area": "", "supplementary_material": "", "author": "Yanyao Shen;Hyokun Yun;Zachary C. 
Lipton;Yakov Kronrod;Animashree Anandkumar", "authorids": "shenyanyao@utexas.edu;yunhyoku@amazon.com;zlipton@cmu.edu;kronrod@amazon.com;animakumar@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nshen2018deep,\ntitle={Deep Active Learning for Named Entity Recognition},\nauthor={Yanyao Shen and Hyokun Yun and Zachary C. Lipton and Yakov Kronrod and Animashree Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry018WZAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 596, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11713670878546571578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "openreview": "https://openreview.net/forum?id=ry018WZAZ", "pdf": "https://openreview.net/pdf?id=ry018WZAZ", "email": ";;;;", "author_num": 5 }, { "id": "ry0WOxbRZ", "title": "IVE-GAN: Invariant Encoding Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "A noval GAN framework that utilizes transformation-invariant features to learn rich representations and strong generators.", "abstract": "Generative adversarial networks (GANs) are a powerful framework for generative tasks. However, they are difficult to train and tend to miss modes of the true data generation process. Although GANs can learn a rich representation of the covered modes of the data in their latent space, the framework misses an inverse mapping from data to this latent space. We propose Invariant Encoding Generative Adversarial Networks (IVE-GANs), a novel GAN framework that introduces such a mapping for individual samples from the data by utilizing features in the data which are invariant to certain transformations. Since the model maps individual samples to the latent space, it naturally encourages the generator to cover all modes. 
We demonstrate the effectiveness of our approach in terms of generative performance and learning rich representations on several datasets including common benchmark image generation tasks.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Robin Winter;Djork-Arn\u00e8 Clevert", "authorids": "robin.winter@bayer.com;djork-arne.clevert@bayer.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwinter2018ivegan,\ntitle={{IVE}-{GAN}: Invariant Encoding Generative Adversarial Networks},\nauthor={Robin Winter and Djork-Arn\u00e8 Clevert},\nyear={2018},\nurl={https://openreview.net/forum?id=ry0WOxbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ry0WOxbRZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0YZzjK_0dgUJ:scholar.google.com/&scioq=IVE-GAN:+Invariant+Encoding+Generative+Adversarial+Networks&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "title": "DORA The Explorer: Directed Outreaching Reinforcement Action-Selection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/261", "id": "ry1arUgCW", "author_site": "Lior Fox, Leshem Choshen, Yonatan Loewenstein", "tldr": "We propose a generalization of visit-counters that evaluate the propagating exploratory value over trajectories, enabling efficient exploration for model-free RL", "abstract": "Exploration is a fundamental aspect of Reinforcement Learning, typically implemented using stochastic action-selection. Exploration, however, can be more efficient if directed toward gaining new world knowledge. Visit-counters have been proven useful both in practice and in theory for directed exploration. However, a major limitation of counters is their locality. While there are a few model-based solutions to this shortcoming, a model-free approach is still missing.\nWe propose $E$-values, a generalization of counters that can be used to evaluate the propagating exploratory value over state-action trajectories. We compare our approach to commonly used RL techniques, and show that using $E$-values improves learning and performance over traditional counters. We also show how our method can be implemented with function approximation to efficiently learn continuous MDPs. 
We demonstrate this by showing that our approach surpasses state of the art performance in the Freeway Atari 2600 game.", "keywords": "Reinforcement Learning;Exploration;Model-Free", "primary_area": "", "supplementary_material": "", "author": "Lior Fox;Leshem Choshen;Yonatan Loewenstein", "authorids": "lior.fox@mail.huji.ac.il;leshem.choshen@mail.huji.ac.il;yonatan.loewenstein@mail.huji.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfox2018dora,\ntitle={{DORA} The Explorer: Directed Outreaching Reinforcement Action-Selection},\nauthor={Lior Fox and Leshem Choshen and Yonatan Loewenstein},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry1arUgCW},\n}", "github": "[![github](/images/github_icon.svg) borgr/DORA](https://github.com/borgr/DORA)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10658112327839471119&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ry1arUgCW", "pdf": "https://openreview.net/pdf?id=ry1arUgCW", "email": ";;", "author_num": 3 }, { "id": "ry4S90l0b", "title": "A Self-Training Method for Semi-Supervised GANs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Since the creation of Generative Adversarial Networks (GANs), much work has been done to improve their training stability, their generated image quality, their range of application but nearly none of them explored their self-training potential. Self-training has been used before the advent of deep learning in order to allow training on limited labelled training data and has shown impressive results in semi-supervised learning. In this work, we combine these two ideas and make GANs self-trainable for semi-supervised learning tasks by exploiting their infinite data generation potential. Results show that using even the simplest form of self-training yields an improvement. We also show results for a more complex self-training scheme that performs at least as well as the basic self-training scheme but with significantly less data augmentation. 
", "keywords": "self-training;generative adversarial networks;semi-supervised", "primary_area": "", "supplementary_material": "", "author": "Alan Do-Omri;Dalei Wu;Xiaohua Liu", "authorids": "alan.do-omri@mail.mcgill.ca;daleiwu@gmail.com;liuxiaohua3@huawei.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndo-omri2018a,\ntitle={A Self-Training Method for Semi-Supervised {GAN}s},\nauthor={Alan Do-Omri and Dalei Wu and Xiaohua Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=ry4S90l0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ry4S90l0b", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8231642238629842379&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "ry4SNTe0-", "title": "Improve Training Stability of Semi-supervised Generative Adversarial Networks with Collaborative Training", "track": "main", "status": "Reject", "tldr": "Improve Training Stability of Semi-supervised Generative Adversarial Networks with Collaborative Training", "abstract": "Improved generative adversarial network (Improved GAN) is a successful method of using generative adversarial models to solve the problem of semi-supervised learning. However, it suffers from the problem of unstable training. In this paper, we found that the instability is mostly due to the vanishing gradients on the generator. To remedy this issue, we propose a new method to use collaborative training to improve the stability of semi-supervised GAN with the combination of Wasserstein GAN. The experiments have shown that our proposed method is more stable than the original Improved GAN and achieves comparable classification accuracy on different data sets. 
", "keywords": "generative adversarial training;semi-supervised training;collaborative training", "primary_area": "", "supplementary_material": "", "author": "Dalei Wu;Xiaohua Liu", "authorids": "daleiwu@gmail.com;", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwu2018improve,\ntitle={Improve Training Stability of Semi-supervised Generative Adversarial Networks with Collaborative Training},\nauthor={Dalei Wu and Xiaohua Liu},\nyear={2018},\nurl={https://openreview.net/forum?id=ry4SNTe0-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ry4SNTe0-", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14667402901944031930&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ry5wc1bCW", "title": "Causal Generative Neural Networks", "track": "main", "status": "Reject", "tldr": "Discover the structure of functional causal models with generative neural networks", "abstract": "We introduce CGNN, a framework to learn functional causal models as generative neural networks. These networks are trained using backpropagation to minimize the maximum mean discrepancy to the observed data. Unlike previous approaches, CGNN leverages both conditional independences and distributional asymmetries to seamlessly discover bivariate and multivariate \n causal structures, with or without hidden variables. CGNN does not only estimate the causal structure, but a full and differentiable generative model of the data. Throughout an extensive variety of experiments, we illustrate the competitive esults of CGNN w.r.t state-of-the-art alternatives in observational causal discovery on both simulated and real data, in the tasks of cause-effect inference, v-structure identification, and multivariate causal discovery. 
", "keywords": "Causal structure discovery;Generative neural networks;Cause-effect pair problem;Functional causal model;Maximum Mean Discrepancy;Structural Equation Models", "primary_area": "", "supplementary_material": "", "author": "Olivier Goudet;Diviyan Kalainathan;David Lopez-Paz;Philippe Caillou;Isabelle Guyon;Mich\u00e8le Sebag", "authorids": "olivier.goudet@lri.fr;diviyan.kalainathan@lri.fr;dlp@fb.com;philippe.caillou@lri.fr;isabelle.guyon@chalearn.org;michele.sebag@lri.fr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngoudet2018causal,\ntitle={Causal Generative Neural Networks},\nauthor={Olivier Goudet and Diviyan Kalainathan and David Lopez-Paz and Philippe Caillou and Isabelle Guyon and Mich\u00e8le Sebag},\nyear={2018},\nurl={https://openreview.net/forum?id=ry5wc1bCW},\n}", "github": "[![github](/images/github_icon.svg) GoudetOlivier/CGNN](https://github.com/GoudetOlivier/CGNN)", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=ry5wc1bCW", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 2, "authors#_avg": 6, "corr_rating_confidence": 0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8656026150030171842&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Active Neural Localization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/319", "id": "ry6-G_66b", "author_site": "Devendra Singh Chaplot, Emilio Parisotto, Ruslan Salakhutdinov", "tldr": "\"Active Neural Localizer\", a fully differentiable neural network that learns to localize efficiently using deep reinforcement learning.", "abstract": "Localization is the problem of estimating the location of an autonomous agent from an observation and a map of the environment. Traditional methods of localization, which filter the belief based on the observations, are sub-optimal in the number of steps required, as they do not decide the actions taken by the agent. We propose \"Active Neural Localizer\", a fully differentiable neural network that learns to localize efficiently. The proposed model incorporates ideas of traditional filtering-based localization methods, by using a structured belief of the state with multiplicative interactions to propagate belief, and combines it with a policy model to minimize the number of steps required for localization. Active Neural Localizer is trained end-to-end with reinforcement learning. We use a variety of simulation environments for our experiments which include random 2D mazes, random mazes in the Doom game engine and a photo-realistic environment in the Unreal game engine. The results on the 2D environments show the effectiveness of the learned policy in an idealistic setting while results on the 3D environments demonstrate the model's capability of learning the policy and perceptual model jointly from raw-pixel based RGB observations. We also show that a model trained on random textures in the Doom environment generalizes well to a photo-realistic office space environment in the Unreal engine. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Devendra Singh Chaplot;Emilio Parisotto;Ruslan Salakhutdinov", "authorids": "chaplot@cs.cmu.edu;eparisot@andrew.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsingh2018active,\ntitle={Active Neural Localization},\nauthor={Devendra Singh Chaplot and Emilio Parisotto and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry6-G_66b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15441033474662585053&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ry6-G_66b", "pdf": "https://openreview.net/pdf?id=ry6-G_66b", "email": ";;", "author_num": 3 }, { "title": "Hierarchical Subtask Discovery with Non-Negative Matrix Factorization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/56", "id": "ry80wMW0W", "author_site": "Adam Earle, Andrew Saxe, Benjamin Rosman", "tldr": "We present a novel algorithm for hierarchical subtask discovery which leverages the multitask linear Markov decision process framework.", "abstract": "Hierarchical reinforcement learning methods offer a powerful means of planning flexible behavior in complicated domains. However, learning an appropriate hierarchical decomposition of a domain into subtasks remains a substantial challenge. We present a novel algorithm for subtask discovery, based on the recently introduced multitask linearly-solvable Markov decision process (MLMDP) framework. The MLMDP can perform never-before-seen tasks by representing them as a linear combination of a previously learned basis set of tasks. In this setting, the subtask discovery problem can naturally be posed as finding an optimal low-rank approximation of the set of tasks the agent will face in a domain. We use non-negative matrix factorization to discover this minimal basis set of tasks, and show that the technique learns intuitive decompositions in a variety of domains. Our method has several qualitatively desirable features: it is not limited to learning subtasks with single goal states, instead learning distributed patterns of preferred states; it learns qualitatively different hierarchical decompositions in the same domain depending on the ensemble of tasks the agent will face; and it may be straightforwardly iterated to obtain deeper hierarchical decompositions.", "keywords": "Reinforcement Learning;Hierarchy;Subtask Discovery;Linear Markov Decision Process", "primary_area": "", "supplementary_material": "", "author": "Adam C. Earle;Andrew M. 
Saxe;Benjamin Rosman", "authorids": "adam.earle@ymail.com;asaxe@fas.harvard.edu;benjros@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nc.2018hierarchical,\ntitle={Hierarchical Subtask Discovery with Non-Negative Matrix Factorization},\nauthor={Adam C. Earle and Andrew M. Saxe and Benjamin Rosman},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry80wMW0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer5", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;2;3", "rating_avg": 6.0, "confidence_avg": 2.3333333333333335, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7464922712154923612&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "openreview": "https://openreview.net/forum?id=ry80wMW0W", "pdf": "https://openreview.net/pdf?id=ry80wMW0W", "email": ";;", "author_num": 3 }, { "id": "ry831QWAb", "title": "BLOCK-NORMALIZED GRADIENT METHOD: AN EMPIRICAL STUDY FOR TRAINING DEEP NEURAL NETWORK", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a generic and simple strategy for utilizing stochastic gradient information in optimization. The technique essentially contains two consecutive steps in each iteration: 1) computing and normalizing each block (layer) of the mini-batch stochastic gradient; 2) selecting appropriate step size to update the decision variable (parameter) towards the negative of the block-normalized gradient. We conduct extensive empirical studies on various non-convex neural network optimization problems, including multilayer perceptron, convolution neural networks and recurrent neural networks. The results indicate the block-normalized gradient can help accelerate the training of neural networks. In particular,\nwe observe that the normalized gradient methods having constant step size with occasionally decay, such as SGD with momentum, have better performance in the deep convolution neural networks, while those with adaptive step sizes, such as Adam, perform better in recurrent neural networks. Besides, we also observe this line of methods can lead to solutions with better generalization properties, which is confirmed by the performance improvement over strong baselines. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adams Wei Yu;Lei Huang;Qihang Lin;Ruslan Salakhutdinov;Jaime Carbonell", "authorids": "weiyu@cs.cmu.edu;huanglei@nlsde.buaa.edu.cn;qihang-lin@uiowa.edu;rsalakhu@cs.cmu.edu;jgc@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwei2018blocknormalized,\ntitle={{BLOCK}-{NORMALIZED} {GRADIENT} {METHOD}: {AN} {EMPIRICAL} {STUDY} {FOR} {TRAINING} {DEEP} {NEURAL} {NETWORK}},\nauthor={Adams Wei Yu and Lei Huang and Qihang Lin and Ruslan Salakhutdinov and Jaime Carbonell},\nyear={2018},\nurl={https://openreview.net/forum?id=ry831QWAb},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ry831QWAb)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ry831QWAb", "pdf_size": 0, "rating": "2;4;9", "confidence": "5;5;5", "rating_avg": 5.0, "confidence_avg": 5.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16091074646466946304&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Routing Networks: Adaptive Selection of Non-Linear Functions for Multi-Task Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/58", "id": "ry8dvM-R-", "author_site": "Clemens Rosenbaum, Tim Klinger, Matt Riemer", "tldr": "routing networks: a new kind of neural network which learns to adaptively route its input for multi-task learning", "abstract": "Multi-task learning (MTL) with neural networks leverages commonalities in tasks to improve performance, but often suffers from task interference which reduces the benefits of transfer. To address this issue we introduce the routing network paradigm, a novel neural network and training algorithm. A routing network is a kind of self-organizing neural network consisting of two components: a router and a set of one or more function blocks. A function block may be any neural network \u2013 for example a fully-connected or a convolutional layer. Given an input the router makes a routing decision, choosing a function block to apply and passing the output back to the router recursively, terminating when a fixed recursion depth is reached. In this way the routing network dynamically composes different function blocks for each input. We employ a collaborative multi-agent reinforcement learning (MARL) approach to jointly train the router and function blocks. We evaluate our model against cross-stitch networks and shared-layer baselines on multi-task settings of the MNIST, mini-imagenet, and CIFAR-100 datasets. Our experiments demonstrate a significant improvement in accuracy, with sharper convergence. In addition, routing networks have nearly constant per-task training cost while cross-stitch networks scale linearly with the number of tasks. 
On CIFAR100 (20 tasks) we obtain cross-stitch performance levels with an 85% average reduction in training time.\n", "keywords": "multi-task;transfer;routing;marl;multi-agent;reinforcement;self-organizing", "primary_area": "", "supplementary_material": "", "author": "Clemens Rosenbaum;Tim Klinger;Matthew Riemer", "authorids": "crosenbaum@umass.edu;tklinger@us.ibm.com;mdriemer@us.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nrosenbaum2018routing,\ntitle={Routing Networks: Adaptive Selection of Non-Linear Functions for Multi-Task Learning},\nauthor={Clemens Rosenbaum and Tim Klinger and Matthew Riemer},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry8dvM-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;4", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 302, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15618267721508481898&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ry8dvM-R-", "pdf": "https://openreview.net/pdf?id=ry8dvM-R-", "email": ";;", "author_num": 3 }, { "id": "ry9tUX_6-", "title": "Entropy-SGD optimizes the prior of a PAC-Bayes bound: Data-dependent PAC-Bayes priors via differential privacy", "track": "main", "status": "Reject", "tldr": "We show that Entropy-SGD optimizes the prior of a PAC-Bayes bound, violating the requirement that the prior be independent of data; we use differential privacy to resolve this and improve generalization.", "abstract": "We show that Entropy-SGD (Chaudhari et al., 2017), when viewed as a learning algorithm, optimizes a PAC-Bayes bound on the risk of a Gibbs (posterior) classifier, i.e., a randomized classifier obtained by a risk-sensitive perturbation of the weights of a learned classifier. Entropy-SGD works by optimizing the bound\u2019s prior, violating the hypothesis of the PAC-Bayes theorem that the prior is chosen independently of the data. Indeed, available implementations of Entropy-SGD rapidly obtain zero training error on random labels and the same holds of the Gibbs posterior. In order to obtain a valid generalization bound, we show that an \u03b5-differentially private prior yields a valid PAC-Bayes bound, a straightforward consequence of results connecting generalization with differential privacy. Using stochastic gradient Langevin dynamics (SGLD) to approximate the well-known exponential release mechanism, we observe that generalization error on MNIST (measured on held out data) falls within the (empirically nonvacuous) bounds computed under the assumption that SGLD produces perfect samples. In particular, Entropy-SGLD can be configured to yield relatively tight generalization bounds and still fit real labels, although these same settings do not obtain state-of-the-art performance.", "keywords": "generalization error;neural networks;statistical learning theory;PAC-Bayes theory", "primary_area": "", "supplementary_material": "", "author": "Gintare Karolina Dziugaite;Daniel M. 
Roy", "authorids": "gkd22@cam.ac.uk;droy@utstat.toronto.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkarolina2018entropysgd,\ntitle={Entropy-{SGD} optimizes the prior of a {PAC}-Bayes bound: Data-dependent {PAC}-Bayes priors via differential privacy},\nauthor={Gintare Karolina Dziugaite and Daniel M. Roy},\nyear={2018},\nurl={https://openreview.net/forum?id=ry9tUX_6-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ry9tUX_6-", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6629711064209296308&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryA-jdlA-", "title": "A closer look at the word analogy problem", "track": "main", "status": "Reject", "tldr": "Simple generative approach to solve the word analogy problem which yields insights into word relationships, and the problems with estimating them", "abstract": "Although word analogy problems have become a standard tool for evaluating word vectors, little is known about why word vectors are so good at solving these problems. In this paper, I attempt to further our understanding of the subject, by developing a simple, but highly accurate generative approach to solve the word analogy problem for the case when all terms involved in the problem are nouns. My results demonstrate the ambiguities associated with learning the relationship between a word pair, and the role of the training dataset in determining the relationship which gets most highlighted. Furthermore, my results show that the ability of a model to accurately solve the word analogy problem may not be indicative of a model\u2019s ability to learn the relationship between a word pair the way a human does.\n", "keywords": "word2vec;glove;word analogy;word relationships;word vectors", "primary_area": "", "supplementary_material": "", "author": "Siddharth Krishna Kumar", "authorids": "siddharthkumar@upwork.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nkrishna2018a,\ntitle={A closer look at the word analogy problem},\nauthor={Siddharth Krishna Kumar},\nyear={2018},\nurl={https://openreview.net/forum?id=ryA-jdlA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryA-jdlA-", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lHlJp3vUqaQJ:scholar.google.com/&scioq=A+closer+look+at+the+word+analogy+problem&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryALZdAT-", "title": "Feature Incay for Representation Regularization", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Softmax-based loss is widely used in deep learning for multi-class classification, where each class is represented by a weight vector and each sample is represented as a feature vector. 
Different from traditional learning algorithms where features are pre-defined and only weight vectors are tunable through training, feature vectors are also tunable as representation learning in deep learning. Thus we investigate how to improve the classification performance by better adjusting the features. One main observation is that elongating the feature norm of both correctly-classified and mis-classified feature vectors improves learning: (1) increasing the feature norm of correctly-classified examples induce smaller training loss; (2) increasing the feature norm of mis-classified examples can upweight the contribution from hard examples. Accordingly, we propose feature incay to regularize representation learning by encouraging larger feature norm. In contrast to weight decay which shrinks the weight norm, feature incay is proposed to stretch the feature norm. Extensive empirical results on MNIST, CIFAR10, CIFAR100 and LFW demonstrate the effectiveness of feature incay. ", "keywords": "feature norm;regularization;softmax loss;feature incay", "primary_area": "", "supplementary_material": "", "author": "Yuhui Yuan;Kuiyuan Yang;Jianyuan Guo;Jingdong Wang;Chao Zhang", "authorids": "yuyua@microsoft.com;kuiyuanyang@deepmotion.ai;1701214082@pku.edu.cn;jingdw@microsoft.com;chzhang@cis.pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyuan2018feature,\ntitle={Feature Incay for Representation Regularization},\nauthor={Yuhui Yuan and Kuiyuan Yang and Jianyuan Guo and Jingdong Wang and Chao Zhang},\nyear={2018},\nurl={https://openreview.net/forum?id=ryALZdAT-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryALZdAT-", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;2;4", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5598503891155965069&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/123", "id": "ryBnUWb0b", "author_site": "William A Falcon, Henning Schulzrinne", "tldr": "We used an LSTM to detect when a smartphone walks into a building. Then we predict the device's floor level using data from sensors aboard the smartphone.", "abstract": "In cities with tall buildings, emergency responders need an accurate floor level location to find 911 callers quickly. We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. Unlike impractical previous approaches, our system is the first that does not require the use of beacons, prior knowledge of the building infrastructure, or knowledge of user behavior. 
We demonstrate real-world feasibility through 63 experiments across five different tall buildings throughout New York City where our system predicted the correct floor level with 100% accuracy.\n", "keywords": "Recurrent Neural Networks;RNN;LSTM;Mobile Device;Sensors", "primary_area": "", "supplementary_material": "", "author": "William Falcon;Henning Schulzrinne", "authorids": "waf2107@columbia.edu;hgs@cs.columbia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nfalcon2018predicting,\ntitle={Predicting Floor-Level for 911 Calls with Neural Networks and Smartphone Sensor Data},\nauthor={William Falcon and Henning Schulzrinne},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryBnUWb0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11742683893022168141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ryBnUWb0b", "pdf": "https://openreview.net/pdf?id=ryBnUWb0b", "email": ";", "author_num": 2 }, { "id": "ryCM8zWRb", "title": "Recurrent Neural Networks with Top-k Gains for Session-based Recommendations", "track": "main", "status": "Reject", "tldr": "Improving session-based recommendations with RNNs (GRU4Rec) by 35% using newly designed loss functions and sampling.", "abstract": "RNNs have been shown to be excellent models for sequential data and in particular for session-based user behavior. The use of RNNs provides impressive performance benefits over classical methods in session-based recommendations. In this work we introduce a novel ranking loss function tailored for RNNs in recommendation settings. The better performance of such loss over alternatives, along with further tricks and improvements described in this work, allow to achieve an overall improvement of up to 35% in terms of MRR and Recall@20 over previous session-based RNN solutions and up to 51% over classical collaborative filtering approaches. 
Unlike data augmentation-based improvements, our method does not increase training times significantly.", "keywords": "gru4rec;session-based recommendations;recommender systems;recurrent neural network", "primary_area": "", "supplementary_material": "", "author": "Bal\u00e1zs Hidasi;Alexandros Karatzoglou", "authorids": "hidasib@gmail.com;alexk@tid.es", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhidasi2018recurrent,\ntitle={Recurrent Neural Networks with Top-k Gains for Session-based Recommendations},\nauthor={Bal\u00e1zs Hidasi and Alexandros Karatzoglou},\nyear={2018},\nurl={https://openreview.net/forum?id=ryCM8zWRb},\n}", "github": "[![github](/images/github_icon.svg) hidasib/GRU4Rec](https://github.com/hidasib/GRU4Rec) + [![Papers with Code](/images/pwc_icon.svg) 10 community implementations](https://paperswithcode.com/paper/?openreview=ryCM8zWRb)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryCM8zWRb", "pdf_size": 0, "rating": "4;6;8", "confidence": "5;5;4", "rating_avg": 6.0, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1035, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13823767675348486329&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "ryDNZZZAW", "title": "Multiple Source Domain Adaptation with Adversarial Learning", "track": "main", "status": "Workshop", "tldr": "", "abstract": "While domain adaptation has been actively researched in recent years, most theoretical results and algorithms focus on the single-source-single-target adaptation setting. Naive application of such algorithms on multiple source domain adaptation problem may lead to suboptimal solutions. We propose a new generalization bound for domain adaptation when there are multiple source domains with labeled instances and one target domain with unlabeled instances. Compared with existing bounds, the new bound does not require expert knowledge about the target distribution, nor the optimal combination rule for multisource domains. Interestingly, our theory also leads to an efficient learning strategy using adversarial neural networks: we show how to interpret it as learning feature representations that are invariant to the multiple domain shifts while still being discriminative for the learning task. To this end, we propose two models, both of which we call multisource domain adversarial networks (MDANs): the first model optimizes directly our bound, while the second model is a smoothed approximation of the first one, leading to a more data-efficient and task-adaptive model. The optimization tasks of both models are minimax saddle point problems that can be optimized by adversarial training. To demonstrate the effectiveness of MDANs, we conduct extensive experiments showing superior adaptation performance on three real-world datasets: sentiment analysis, digit classification, and vehicle counting. \n", "keywords": "adversarial learning;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Han Zhao;Shanghang Zhang;Guanhang Wu;Jo\\~{a}o P. Costeira;Jos\\'{e} M. F. Moura;Geoffrey J. 
Gordon", "authorids": "han.zhao@cs.cmu.edu;shanghaz@andrew.cmu.edu;guanhanw@andrew.cmu.edu;jpc@isr.ist.utl.pt;moura@andrew.cmu.edu;ggordon@cs.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhao2018multiple,\ntitle={Multiple Source Domain Adaptation with Adversarial Learning},\nauthor={Han Zhao and Shanghang Zhang and Guanhang Wu and Jo\\~{a}o P. Costeira and Jos\\'{e} M. F. Moura and Geoffrey J. Gordon},\nyear={2018},\nurl={https://openreview.net/forum?id=ryDNZZZAW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryDNZZZAW", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;5;5;3", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1668472442399839002&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "ryEJWe2HM", "title": "Melody Generation for Pop Music via Word Representation of Musical Properties", "track": "main", "status": "Withdraw", "tldr": "We propose a novel model to represent notes and their properties, which can enhance the automatic melody generation.", "abstract": "Automatic melody generation for pop music has been a long-time aspiration for\nboth AI researchers and musicians. However, learning to generate euphonious\nmelody has turned out to be highly challenging due to a number of factors. Representation\nof multivariate property of notes has been one of the primary challenges.\nIt is also difficult to remain in the permissible spectrum of musical variety, outside\nof which would be perceived as a plain random play without auditory pleasantness.\nObserving the conventional structure of pop music poses further challenges.\nIn this paper, we propose to represent each note and its properties as a unique\n\u2018word,\u2019 thus lessening the prospect of misalignments between the properties, as\nwell as reducing the complexity of learning. We also enforce regularization policies\non the range of notes, thus encouraging the generated melody to stay close\nto what humans would find easy to follow. Furthermore, we generate melody\nconditioned on song part information, thus replicating the overall structure of a\nfull song. 
Experimental results demonstrate that our model can generate auditorily\npleasant songs that are more indistinguishable from human-written ones than\nprevious models.", "keywords": "music;lstm;gan;generation;rnn;hmm", "primary_area": "", "supplementary_material": "", "author": "Andrew Shin;Leopold Crestel;Hiroharu Kato;Kuniaki Saito;Katsunori Ohnishi;Masataka Yamaguchi;Masahiro Nakawaki;Yoshitaka Ushiku;Tatsuya Harada", "authorids": "andrew@mi.t.u-tokyo.ac.jp;crestel@ircam.fr;kato@mi.t.u-tokyo.ac.jp;k-saito@mi.t.u-tokyo.ac.jp;ohnishi@mi.t.u-tokyo.ac.jp;yamaguchi@mi.t.u-tokyo.ac.jp;nakawaki.ici@gmail.com;ushiku@mi.t.u-tokyo.ac.jp;harada@mi.t.u-tokyo.ac.jp", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryEJWe2HM", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 3, "authors#_avg": 9, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9945054724239659167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "ryF-cQ6T-", "title": "Machine Learning by Two-Dimensional Hierarchical Tensor Networks: A Quantum Information Theoretic Perspective on Deep Architectures", "track": "main", "status": "Reject", "tldr": "This approach overcomes scalability issues and implies novel mathematical connections among quantum many-body physics, quantum information theory, and machine learning.", "abstract": "The resemblance between the methods used in studying quantum-many body physics and in machine learning has drawn considerable attention. In particular, tensor networks (TNs) and deep learning architectures bear striking similarities to the extent that TNs can be used for machine learning. Previous results used one-dimensional TNs in image recognition, showing limited scalability and a request of high bond dimension. In this work, we train two-dimensional hierarchical TNs to solve image recognition problems, using a training algorithm derived from the multipartite entanglement renormalization ansatz (MERA). This approach overcomes scalability issues and implies novel mathematical connections among quantum many-body physics, quantum information theory, and machine learning. While keeping the TN unitary in the training phase, TN states can be defined, which optimally encodes each class of the images into a quantum many-body state. We study the quantum features of the TN states, including quantum entanglement and fidelity. We suggest these quantities could be novel properties that characterize the image classes, as well as the machine learning tasks. 
Our work could be further applied to identifying possible quantum properties of certain artificial intelligence methods.", "keywords": "quantum machine learning;tensor network;quantum information", "primary_area": "", "supplementary_material": "", "author": "Ding Liu;Shi-Ju Ran;Peter Wittek;Cheng Peng;Raul Bl\u00e1zquez Garc\u00eda;Gang Su;Maciej Lewenstein", "authorids": "dingliu_thu@126.com;shi-ju.ran@icfo.eu;peter.wittek@icfo.eu;pengcheng12@mails.ucas.ac.cn;raulbzga@gmail.com;gsu@ucas.ac.cn;maciej.lewenstein@icfo.eu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nliu2018machine,\ntitle={Machine Learning by Two-Dimensional Hierarchical Tensor Networks: A Quantum Information Theoretic Perspective on Deep Architectures},\nauthor={Ding Liu and Shi-Ju Ran and Peter Wittek and Cheng Peng and Raul Bl\u00e1zquez Garc\u00eda and Gang Su and Maciej Lewenstein},\nyear={2018},\nurl={https://openreview.net/forum?id=ryF-cQ6T-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryF-cQ6T-", "pdf_size": 0, "rating": "3;4;6", "confidence": "2;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 7, "authors#_avg": 7, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2638985552472084132&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "ryG6xZ-RZ", "title": "DLVM: A modern compiler infrastructure for deep learning systems", "track": "main", "status": "Workshop", "tldr": "We introduce a novel compiler infrastructure that addresses shortcomings of existing deep learning frameworks.", "abstract": "Deep learning software demands reliability and performance. However, many of the existing deep learning frameworks are software libraries that act as an unsafe DSL in Python and a computation graph interpreter. We present DLVM, a design and implementation of a compiler infrastructure with a linear algebra intermediate representation, algorithmic differentiation by adjoint code generation, domain- specific optimizations and a code generator targeting GPU via LLVM. Designed as a modern compiler infrastructure inspired by LLVM, DLVM is more modular and more generic than existing deep learning compiler frameworks, and supports tensor DSLs with high expressivity. 
With our prototypical staged DSL embedded in Swift, we argue that the DLVM system enables a form of modular, safe and performant frameworks for deep learning.", "keywords": "deep learning;automatic differentiation;algorithmic differentiation;domain specific languages;neural networks;programming languages;DSLs", "primary_area": "", "supplementary_material": "", "author": "Richard Wei;Lane Schwartz;Vikram Adve", "authorids": "xwei12@illinois.edu;lanes@illinois.edu;vadve@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwei2018dlvm,\ntitle={{DLVM}: A modern compiler infrastructure for deep learning systems},\nauthor={Richard Wei and Lane Schwartz and Vikram Adve},\nyear={2018},\nurl={https://openreview.net/forum?id=ryG6xZ-RZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer2", "site": "https://openreview.net/forum?id=ryG6xZ-RZ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16251965692390475803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Relational Neural Expectation Maximization: Unsupervised Discovery of Objects and their Interactions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/15", "id": "ryH20GbRW", "author_site": "Sjoerd van Steenkiste, Michael Chang, Klaus Greff, J\u00fcrgen Schmidhuber", "tldr": "We introduce a novel approach to common-sense physical reasoning that learns to discover objects and model their physical interactions from raw visual images in a purely unsupervised fashion", "abstract": "Common-sense physical reasoning is an essential ingredient for any intelligent agent operating in the real-world. For example, it can be used to simulate the environment, or to infer the state of parts of the world that are currently unobserved. In order to match real-world conditions this causal knowledge must be learned without access to supervised data. To address this problem we present a novel method that learns to discover objects and model their physical interactions from raw visual images in a purely unsupervised fashion. It incorporates prior knowledge about the compositional nature of human perception to factor interactions between object-pairs and learn efficiently. On videos of bouncing balls we show the superior modelling capabilities of our method compared to other unsupervised neural approaches that do not incorporate such prior knowledge. 
We demonstrate its ability to handle occlusion and show that it can extrapolate learned knowledge to scenes with different numbers of objects.", "keywords": "Common-sense Physical Reasoning;Intuitive Physics;Representation Learning;Model building", "primary_area": "", "supplementary_material": "", "author": "Sjoerd van Steenkiste;Michael Chang;Klaus Greff;J\u00fcrgen Schmidhuber", "authorids": "sjoerd@idsia.ch;mbchang@berkeley.edu;klaus@idsia.ch;juergen@idsia.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nvan2018relational,\ntitle={Relational Neural Expectation Maximization: Unsupervised Discovery of Objects and their Interactions},\nauthor={Sjoerd van Steenkiste and Michael Chang and Klaus Greff and J\u00fcrgen Schmidhuber},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryH20GbRW},\n}", "github": "[![github](/images/github_icon.svg) sjoerdvansteenkiste/Relational-NEM](https://github.com/sjoerdvansteenkiste/Relational-NEM) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ryH20GbRW)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;3;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11323622217846680222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ryH20GbRW", "pdf": "https://openreview.net/pdf?id=ryH20GbRW", "email": ";;;", "author_num": 4 }, { "id": "ryHM_fbA-", "title": "Learning Document Embeddings With CNNs", "track": "main", "status": "Reject", "tldr": "Convolutional neural network model for unsupervised document embedding.", "abstract": "This paper proposes a new model for document embedding. Existing approaches either require complex inference or use recurrent neural networks that are difficult to parallelize. We take a different route and use recent advances in language modeling to develop a convolutional neural network embedding model. This allows us to train deeper architectures that are fully parallelizable. Stacking layers together increases the receptive field, allowing each successive layer to model increasingly longer range semantic dependencies within the document. Empirically we demonstrate superior results on two publicly available benchmarks. 
Full code will be released with the final version of this paper.", "keywords": "unsupervised embedding;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Shunan Zhao;Chundi Lui;Maksims Volkovs", "authorids": "shunan@layer6.ai;chundi@layer6.ai;maksims.volkovs@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhao2018learning,\ntitle={Learning Document Embeddings With {CNN}s},\nauthor={Shunan Zhao and Chundi Lui and Maksims Volkovs},\nyear={2018},\nurl={https://openreview.net/forum?id=ryHM_fbA-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryHM_fbA-", "pdf_size": 0, "rating": "2;4;6", "confidence": "5;3;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6534240609735631862&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryH_bShhW", "title": "DOUBLY STOCHASTIC ADVERSARIAL AUTOENCODER", "track": "main", "status": "Reject", "tldr": "", "abstract": "Any autoencoder network can be turned into a generative model by imposing an arbitrary prior distribution on its hidden code vector. Variational Autoencoder uses a KL divergence penalty to impose the prior, whereas Adversarial Autoencoder uses generative adversarial networks. A straightforward modification of Adversarial Autoencoder can be achieved by replacing the adversarial network with a maximum mean discrepancy (MMD) network. This replacement leads to a new set of probabilistic autoencoders which is also discussed in our paper.\n\nHowever, an essential challenge remains in both of these probabilistic autoencoders, namely that the only source of randomness at the output of the encoder is the training data itself. Lack of enough stochasticity can make the optimization problem non-trivial. As a result, they can lead to degenerate solutions where the generator collapses into sampling only a few modes.\n\nOur proposal is to replace the adversary of the adversarial autoencoder by a space of {\it stochastic} functions. This replacement introduces a new source of randomness which can be considered as a continuous control for encouraging {\it explorations}. This prevents the adversary from fitting too closely to the generator and therefore leads to a more diverse set of generated samples. Consequently, the decoder serves as a better generative network which, unlike MMD nets, scales linearly with the amount of data. We provide mathematical and empirical evidence on how this replacement outperforms the pre-existing architectures. 
", "keywords": "Generative adversarial Networks;Deep Generative models;Kernel Methods", "primary_area": "", "supplementary_material": "", "author": "Mahdi Azarafrooz", "authorids": "mazarafrooz@cylance.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nazarafrooz2018doubly,\ntitle={{DOUBLY} {STOCHASTIC} {ADVERSARIAL} {AUTOENCODER}},\nauthor={Mahdi Azarafrooz},\nyear={2018},\nurl={https://openreview.net/forum?id=ryH_bShhW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryH_bShhW", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9071964703668418054&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "ryKRRsm0Z", "title": "Binarized Back-Propagation: Training Binarized Neural Networks with Binarized Gradients", "track": "main", "status": "Withdraw", "tldr": "Binarized Back-Propagation all you need for completely binarized training is to is to inflate the size of the network", "abstract": " Binarized Neural networks (BNNs) have been shown to be effective in improving network efficiency during the inference phase, after the network has been trained. However, BNNs only binarize the model parameters and activations during propagations. Therefore, BNNs do not offer significant efficiency improvements during training, since the gradients are still propagated and used with high precision. \n \n We show there is no inherent difficulty in training BNNs using \"Binarized BackPropagation\" (BBP), in which we also binarize the gradients. To avoid significant degradation in test accuracy, we simply increase the number of filter maps in a each convolution layer. Using BBP on dedicated hardware can potentially significantly improve the execution efficiency (\\emph{e.g.}, reduce dynamic memory footprint, memory bandwidth and computational energy) and speed up the training process with an appropriate hardware support, even after such an increase in network size. Moreover, our method is ideal for distributed learning as it reduces the communication costs significantly (e.g., by ~32). 
Using this method, we demonstrate a minimal loss in classification accuracy on several datasets and topologies.", "keywords": "Neural Network acceleration;Low Precision neural networks.", "primary_area": "", "supplementary_material": "", "author": "Itay Hubara;Elad Hoffer;Daniel Soudy", "authorids": "itayhubara@gmail.com;elad.hoffer@gmail.com;daniel.soudry@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=ryKRRsm0Z", "pdf_size": 0, "rating": "", "confidence": "", "rating_avg": 0, "confidence_avg": 0, "replies_avg": 0, "authors#_avg": 3, "corr_rating_confidence": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15940518733243094398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryOBB6g-M", "title": "Spatial Variational Auto-Encoding via Matrix-Variate Normal Distributions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The key idea of variational auto-encoders (VAEs) resembles that of traditional auto-encoder models in which spatial information is supposed to be explicitly encoded in the latent space. However, the latent variables in VAEs are vectors, which can be interpreted as multiple feature maps of size 1x1. Such representations can only convey spatial information implicitly when coupled with powerful decoders. In this work, we propose spatial VAEs that use feature maps of larger size as latent variables to explicitly capture spatial information. This is achieved by allowing the latent variables to be sampled from matrix-variate normal (MVN) distributions whose parameters are computed from the encoder network. To increase dependencies among locations on latent feature maps and reduce the number of parameters, we further propose spatial VAEs via low-rank MVN distributions. Experimental results show that the proposed spatial VAEs outperform original VAEs in capturing rich structural and spatial information.", "keywords": "Variational auto-encoder;unsupervised learning;image generation;spatial information;matrix-variate normal distribution", "primary_area": "", "supplementary_material": "", "author": "Zhengyang Wang;Hao Yuan;Shuiwang Ji", "authorids": "zwang6@eecs.wsu.edu;hao.yuan@wsu.edu;sji@eecs.wsu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryOBB6g-M", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;2", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 3, "authors#_avg": 3, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13922936018819397101&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "ryOG3fWCW", "title": "Model Specialization for Inference Via End-to-End Distillation, Pruning, and Cascades", "track": "main", "status": "Reject", "tldr": "", "abstract": "The availability of general-purpose reference and benchmark datasets such as\nImageNet have spurred the development of general-purpose popular reference\nmodel architectures and pre-trained weights. 
However, in practice, neural networks are often employed to perform specific, more restrictive tasks that are narrower in scope and complexity. Thus, simply fine-tuning or transfer learning from a general-purpose network inherits a large computational cost that may not be necessary for a given task. In this work, we investigate the potential for model specialization, or reducing a model\u2019s computational footprint by leveraging task-specific knowledge, such as a restricted inference distribution. We study three methods for model specialization\u20141) task-aware distillation, 2) task-aware pruning, and 3) specialized model cascades\u2014and evaluate their performance on a range of classification tasks. Moreover, for the first time, we investigate how these techniques complement one another, enabling up to 5\u00d7 speedups with no loss in accuracy and 9.8\u00d7 speedups while remaining within 2.5% of a highly accurate ResNet on specialized image classification tasks. These results suggest that simple and easy-to-implement specialization procedures may benefit a large number of practical applications in which the representational power of general-purpose networks need not be inherited.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daniel Kang;Karey Shi;Thao Ngyuen;Stephanie Mallard;Peter Bailis;Matei Zaharia", "authorids": ";kareyshi@stanford.edu;;;;", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkang2018model,\ntitle={Model Specialization for Inference Via End-to-End Distillation, Pruning, and Cascades},\nauthor={Daniel Kang and Karey Shi and Thao Ngyuen and Stephanie Mallard and Peter Bailis and Matei Zaharia},\nyear={2018},\nurl={https://openreview.net/forum?id=ryOG3fWCW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryOG3fWCW", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jh5vDYWMywAJ:scholar.google.com/&scioq=Model+Specialization+for+Inference+Via+End-to-End+Distillation,+Pruning,+and+Cascades&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryOegkTXf", "title": "Deep Active Learning over the Long Tail", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper is concerned with pool-based active learning for deep neural networks. Motivated by coreset dataset compression ideas, we present a novel active learning algorithm that queries consecutive points from the pool using farthest-first traversals in the space of neural activation over a representation layer. We show consistent and overwhelming improvement in sample complexity over passive learning (random sampling) for three datasets: MNIST, Cifar-10, and Cifar-100. 
In addition, our algorithm outperforms the traditional uncertainty sampling technique (obtained using softmax activations), and we identify cases where uncertainty sampling is only slightly better than random sampling.", "keywords": "Active Learning;Deep Learning;Coreset;Deep Representation;Compression", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper718/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018deep,\n title={Deep Active Learning over the Long Tail},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=S1XGhbW0Z}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryOegkTXf", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14325044042761780166&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "On the Convergence of Adam and Beyond", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/78", "id": "ryQu7f-RZ", "author_site": "Sashank Reddi, Satyen Kale, Sanjiv Kumar", "tldr": "We investigate the convergence of popular optimization algorithms like Adam , RMSProp and propose new variants of these methods which provably converge to optimal solution in convex settings. ", "abstract": " Several recently proposed stochastic optimization methods that have been successfully used in training deep networks such as RMSProp, Adam, Adadelta, Nadam are based on using gradient updates scaled by square roots of exponential moving averages of squared past gradients. In many applications, e.g. learning with large output spaces, it has been empirically observed that these algorithms fail to converge to an optimal solution (or a critical point in nonconvex settings). We show that one cause for such failures is the exponential moving average used in the algorithms. We provide an explicit example of a simple convex optimization setting where Adam does not converge to the optimal solution, and describe the precise problems with the previous analysis of Adam algorithm. Our analysis suggests that the convergence issues can be fixed by endowing such algorithms with ``long-term memory'' of past gradients, and propose new variants of the Adam algorithm which not only fix the convergence issues but often also lead to improved empirical performance.", "keywords": "optimization;deep learning;adam;rmsprop", "primary_area": "", "supplementary_material": "", "author": "Sashank J. Reddi;Satyen Kale;Sanjiv Kumar", "authorids": "sashank@google.com;satyenkale@google.com;sanjivk@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nj.2018on,\ntitle={On the Convergence of Adam and Beyond},\nauthor={Sashank J. 
Reddi and Satyen Kale and Sanjiv Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryQu7f-RZ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=ryQu7f-RZ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "8;8;9", "confidence": "4;3;5", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 29, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 3446, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7572152545124305671&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=ryQu7f-RZ", "pdf": "https://openreview.net/pdf?id=ryQu7f-RZ", "email": ";;", "author_num": 3 }, { "id": "ryQz_ZfHz", "title": "Neural Variational Sparse Topic Model", "track": "main", "status": "Withdraw", "tldr": "a neural sparsity-enhanced topic model based on VAE", "abstract": "Effectively inferring discriminative and coherent latent topics of short texts is a critical task for many real world applications. Nevertheless, the task has been proven to be a great challenge for traditional topic models due to the data sparsity problem induced by the characteristics of short texts. Moreover, the complex inference algorithm also become a bottleneck for these traditional models to rapidly explore variations. In this paper, we propose a novel model called Neural Variational Sparse Topic Model (NVSTM) based on a sparsity-enhanced topic model named Sparse Topical Coding (STC). In the model, the auxiliary word embeddings are utilized to improve the generation of representations. The Variational Autoencoder (VAE) approach is applied to inference the model efficiently, which makes the model easy to explore extensions for its black-box inference process. 
Experimental results on Web Snippets, 20Newsgroups, BBC and Biomedical datasets show the effectiveness and efficiency of the model.", "keywords": "Variational Autoencoder;Sparse Topical Coding;Neural Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper79/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018neural,\n title={Neural Variational Sparse Topic Model},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=Sy33KapTb}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryQz_ZfHz", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Multi-View Data Generation Without View Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/104", "id": "ryRh0bb0Z", "author_site": "Mickael Chen, Ludovic Denoyer, thierry artieres", "tldr": "We describe a novel multi-view generative model that can generate multiple views of the same object, or multiple objects in the same view with no need of label on views.", "abstract": "The development of high-dimensional generative models has recently gained a great surge of interest with the introduction of variational auto-encoders and generative adversarial neural networks. Different variants have been proposed where the underlying latent space is structured, for example, based on attributes describing the data to generate. We focus on a particular problem where one aims at generating samples corresponding to a number of objects under various views. We assume that the distribution of the data is driven by two independent latent factors: the content, which represents the intrinsic features of an object, and the view, which stands for the settings of a particular observation of that object. Therefore, we propose a generative model and a conditional variant built on such a disentangled latent space. This approach allows us to generate realistic samples corresponding to various objects in a high variety of views. Unlike many multi-view approaches, our model doesn't need any supervision on the views but only on the content. Compared to other conditional generation approaches that are mostly based on binary or categorical attributes, we make no such assumption about the factors of variation. Our model can be used on problems with a huge, potentially infinite, number of categories. We evaluate it on four image datasets on which we demonstrate the effectiveness of the model and its ability to generalize. 
", "keywords": "multi-view;adversarial learning;generative model", "primary_area": "", "supplementary_material": "", "author": "Mickael Chen;Ludovic Denoyer;Thierry Arti\u00e8res", "authorids": "mickael.chen@lip6.fr;ludovic.denoyer@lip6.fr;thierry.artieres@lif.univ-mrs.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018multiview,\ntitle={Multi-View Data Generation Without View Supervision},\nauthor={Mickael Chen and Ludovic Denoyer and Thierry Arti\u00e8res},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryRh0bb0Z},\n}", "github": "[![github](/images/github_icon.svg) mickaelChen/GMV](https://github.com/mickaelChen/GMV)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15286827840377806140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ryRh0bb0Z", "pdf": "https://openreview.net/pdf?id=ryRh0bb0Z", "email": ";;", "author_num": 3 }, { "title": "Reinforcement Learning on Web Interfaces using Workflow-Guided Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/28", "id": "ryTp3f-0-", "author_site": "Evan Liu, Kelvin Guu, Panupong Pasupat, Tim Shi, Percy Liang", "tldr": "We solve the sparse rewards problem on web UI tasks using exploration guided by demonstrations", "abstract": "Reinforcement learning (RL) agents improve through trial-and-error, but when reward is sparse and the agent cannot discover successful action sequences, learning stagnates. This has been a notable problem in training deep RL agents to perform web-based tasks, such as booking flights or replying to emails, where a single mistake can ruin the entire sequence of actions. A common remedy is to \"warm-start\" the agent by pre-training it to mimic expert demonstrations, but this is prone to overfitting. Instead, we propose to constrain exploration using demonstrations. From each demonstration, we induce high-level \"workflows\" which constrain the allowable actions at each time step to be similar to those in the demonstration (e.g., \"Step 1: click on a textbox; Step 2: enter some text\"). Our exploration policy then learns to identify successful workflows and samples actions that satisfy these workflows. Workflows prune out bad exploration directions and accelerate the agent\u2019s ability to discover rewards. We use our approach to train a novel neural policy designed to handle the semi-structured nature of websites, and evaluate on a suite of web tasks, including the recent World of Bits benchmark. 
We achieve new state-of-the-art results, and show that workflow-guided exploration improves sample efficiency over behavioral cloning by more than 100x.", "keywords": "reinforcement learning;sparse rewards;web;exploration", "primary_area": "", "supplementary_material": "", "author": "Evan Zheran Liu;Kelvin Guu;Panupong Pasupat;Tianlin Shi;Percy Liang", "authorids": "evzliu@gmail.com;kguu@stanford.edu;ppasupat@cs.stanford.edu;tianlins@cs.stanford.edu;pliang@cs.stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzheran2018reinforcement,\ntitle={Reinforcement Learning on Web Interfaces using Workflow-Guided Exploration},\nauthor={Evan Zheran Liu and Kelvin Guu and Panupong Pasupat and Percy Liang},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryTp3f-0-},\n}", "github": "[![github](/images/github_icon.svg) stanfordnlp/wge](https://github.com/stanfordnlp/wge) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=ryTp3f-0-)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4315385157927012648&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryTp3f-0-", "pdf": "https://openreview.net/pdf?id=ryTp3f-0-", "email": ";;;;", "author_num": 5 }, { "title": "TRUNCATED HORIZON POLICY SEARCH: COMBINING REINFORCEMENT LEARNING & IMITATION LEARNING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/33", "id": "ryUlhzWCZ", "author_site": "Wen Sun, J. A Bagnell, Byron Boots", "tldr": "Combining Imitation Learning and Reinforcement Learning to learn to outperform the expert", "abstract": "In this paper, we propose to combine imitation and reinforcement learning via the idea of reward shaping using an oracle. We study the effectiveness of the near- optimal cost-to-go oracle on the planning horizon and demonstrate that the cost- to-go oracle shortens the learner\u2019s planning horizon as function of its accuracy: a globally optimal oracle can shorten the planning horizon to one, leading to a one- step greedy Markov Decision Process which is much easier to optimize, while an oracle that is far away from the optimality requires planning over a longer horizon to achieve near-optimal performance. Hence our new insight bridges the gap and interpolates between imitation learning and reinforcement learning. Motivated by the above mentioned insights, we propose Truncated HORizon Policy Search (THOR), a method that focuses on searching for policies that maximize the total reshaped reward over a finite planning horizon when the oracle is sub-optimal. We experimentally demonstrate that a gradient-based implementation of THOR can achieve superior performance compared to RL baselines and IL baselines even when the oracle is sub-optimal.", "keywords": "Imitation Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Wen Sun;J. 
Andrew Bagnell;Byron Boots", "authorids": "wensun@cs.cmu.edu;dbagnell@cs.cmu.edu;bboots@cc.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsun2018truncated,\ntitle={{TRUNCATED} {HORIZON} {POLICY} {SEARCH}: {DEEP} {COMBINATION} {OF} {REINFORCEMENT} {AND} {IMITATION}},\nauthor={Wen Sun and J. Andrew Bagnell and Byron Boots},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryUlhzWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.9607689228305228, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=782954345548959677&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ryUlhzWCZ", "pdf": "https://openreview.net/pdf?id=ryUlhzWCZ", "email": ";;", "author_num": 3 }, { "id": "ryY4RhkCZ", "title": "DEEP DENSITY NETWORKS AND UNCERTAINTY IN RECOMMENDER SYSTEMS", "track": "main", "status": "Reject", "tldr": "We have introduced Deep Density Network, a unified DNN model to estimate uncertainty for exploration/exploitation in recommendation systems.", "abstract": "Building robust online content recommendation systems requires learning complex interactions between user preferences and content features. The field has evolved rapidly in recent years from traditional multi-arm bandit and collaborative filtering techniques, with new methods integrating Deep Learning models that enable capturing non-linear feature interactions. Despite progress, the dynamic nature of online recommendations still poses great challenges, such as finding the delicate balance between exploration and exploitation. In this paper we provide a novel method, Deep Density Networks (DDN), which deconvolves measurement and data uncertainty and predicts probability densities of CTR, enabling us to perform more efficient exploration of the feature space. 
We show the usefulness of using DDN online in a real-world content recommendation system that serves billions of recommendations per day, and present online and offline results to evaluate the benefit of using DDN.", "keywords": "deep learning;recommendation system;uncertainty;context-based and collaborative filtering", "primary_area": "", "supplementary_material": "", "author": "Yoel Zeldes;Stavros Theodorakis;Efrat Solodnik;Aviv Rotman;Gil Chamiel;Dan Friedman", "authorids": "yoel.z@taboola.com;sth@deeplab.ai;efrat.s@taboola.com;aviv.r@taboola.com;gil.c@taboola.com;dan.f@taboola.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzeldes2018deep,\ntitle={{DEEP} {DENSITY} {NETWORKS} {AND} {UNCERTAINTY} {IN} {RECOMMENDER} {SYSTEMS}},\nauthor={Yoel Zeldes and Stavros Theodorakis and Efrat Solodnik and Aviv Rotman and Gil Chamiel and Dan Friedman},\nyear={2018},\nurl={https://openreview.net/forum?id=ryY4RhkCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryY4RhkCZ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;3", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2395325973812168273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryZ283gAZ", "title": "Beyond Finite Layer Neural Networks: Bridging Deep Architectures and Numerical Differential Equations", "track": "main", "status": "Workshop", "tldr": "This paper bridges deep network architectures with numerical (stochastic) differential equations. This new perspective enables new designs of more effective deep neural networks.", "abstract": "Deep neural networks have become the state-of-the-art models in numerous machine learning tasks. However, general guidance to network architecture design is still missing. In our work, we bridge deep neural network design with numerical differential equations. We show that many effective networks, such as ResNet, PolyNet, FractalNet and RevNet, can be interpreted as different numerical discretizations of differential equations. This finding brings us a brand new perspective on the design of effective deep architectures. We can take advantage of the rich knowledge in numerical analysis to guide us in designing new and potentially more effective deep networks. As an example, we propose a linear multi-step architecture (LM-architecture) which is inspired by the linear multi-step method solving ordinary differential equations. The LM-architecture is an effective structure that can be used on any ResNet-like networks. In particular, we demonstrate that LM-ResNet and LM-ResNeXt (i.e. the networks obtained by applying the LM-architecture on ResNet and ResNeXt respectively) can achieve noticeably higher accuracy than ResNet and ResNeXt on both CIFAR and ImageNet with comparable numbers of trainable parameters. In particular, on both CIFAR and ImageNet, LM-ResNet/LM-ResNeXt can significantly compress (>50%) the original networks while maintaining a similar performance. This can be explained mathematically using the concept of modified equation from numerical analysis. 
Last but not least, we also establish a connection between stochastic control and noise injection in the training process which helps to improve generalization of the networks. Furthermore, by relating stochastic training strategy with stochastic dynamic system, we can easily apply stochastic training to the networks with the LM-architecture. As an example, we introduce stochastic depth to LM-ResNet and achieve significant improvement over the original LM-ResNet on CIFAR10.", "keywords": "deep convolutional network;residual network;dynamic system;stochastic dynamic system;modified equation", "primary_area": "", "supplementary_material": "", "author": "Yiping Lu;Aoxiao Zhong;Quanzheng Li;Bin Dong", "authorids": "luyiping9712@pku.edu.cn;zhongaoxiao@gmail.com;quanzhengli5@gmail.com;dongbin@math.pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlu2018beyond,\ntitle={Beyond Finite Layer Neural Networks: Bridging Deep Architectures and Numerical Differential Equations},\nauthor={Yiping Lu and Aoxiao Zhong and Quanzheng Li and Bin Dong},\nyear={2018},\nurl={https://openreview.net/forum?id=ryZ283gAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=ryZ283gAZ", "pdf_size": 0, "rating": "5;5;6;7", "confidence": "3;1;1;4", "rating_avg": 5.75, "confidence_avg": 2.25, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 674, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15552677819794260504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "ryZ3KCy0W", "title": "Link Weight Prediction with Node Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Application of deep learning has been successful in various domains such as image recognition, speech recognition and natural language processing. However, the research on its application in graph mining is still in an early stage. Here we present the first generic deep learning approach to the graph link weight prediction problem based on node embeddings. We evaluate this approach with three different node embedding techniques experimentally and compare its performance with two state-of-the-art non-deep-learning baseline approaches. Our experiment results suggest that this deep learning approach outperforms the baselines by up to 70% depending on the dataset and embedding technique applied. This approach shows that deep learning can be successfully applied to link weight prediction to improve prediction accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Hou;Lawrence B. Holder", "authorids": "yuchen.hou@wsu.edu;holder@wsu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhou2018link,\ntitle={Link Weight Prediction with Node Embeddings},\nauthor={Yuchen Hou and Lawrence B. 
Holder},\nyear={2018},\nurl={https://openreview.net/forum?id=ryZ3KCy0W},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryZ3KCy0W", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11813551185154882165&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "ryZ8sz-Ab", "title": "Fast and Accurate Text Classification: Skimming, Rereading and Early Stopping", "track": "main", "status": "Workshop", "tldr": "We develop an end-to-end trainable approach for skimming, rereading and early stopping applicable to classification tasks. ", "abstract": "Recent advances in recurrent neural nets (RNNs) have shown much promise in many applications in natural language processing. For most of these tasks, such as sentiment analysis of customer reviews, a recurrent neural net model parses the entire review before forming a decision. We argue that reading the entire input is not always necessary in practice, since a lot of reviews are often easy to classify, i.e., a decision can be formed after reading some crucial sentences or words in the provided text. In this paper, we present an approach of fast reading for text classification. Inspired by several well-known human reading techniques, our approach implements an intelligent recurrent agent which evaluates the importance of the current snippet in order to decide whether to make a prediction, or to skip some texts, or to re-read part of the sentence. Our agent uses an RNN module to encode information from the past and the current tokens, and applies a policy module to form decisions. With an end-to-end training algorithm based on policy gradient, we train and test our agent on several text classification datasets and achieve both higher efficiency and better accuracy compared to previous approaches. \n", "keywords": "Topic Classification;Sentiment Analysis;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Keyi Yu;Yang Liu;Alexander G. Schwing;Jian Peng", "authorids": "yu-ky14@mails.tsinghua.edu.cn;liu301@illinois.edu;aschwing@illinois.edu;jianpeng@illinois.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyu2018fast,\ntitle={Fast and Accurate Text Classification: Skimming, Rereading and Early Stopping},\nauthor={Keyi Yu and Yang Liu and Alexander G. 
Schwing and Jian Peng},\nyear={2018},\nurl={https://openreview.net/forum?id=ryZ8sz-Ab},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryZ8sz-Ab", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14990680647694185204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryZERzWCZ", "title": "The Information-Autoencoding Family: A Lagrangian Perspective on Latent Variable Generative Modeling", "track": "main", "status": "Reject", "tldr": "", "abstract": "A variety of learning objectives have been recently proposed for training generative models. We show that many of them, including InfoGAN, ALI/BiGAN, ALICE, CycleGAN, VAE, $\\beta$-VAE, adversarial autoencoders, AVB, and InfoVAE, are Lagrangian duals of the same primal optimization problem. This generalization reveals the implicit modeling trade-offs between flexibility and computational requirements being made by these models. Furthermore, we characterize the class of all objectives that can be optimized under certain computational constraints.\nFinally, we show how this new Lagrangian perspective can explain undesirable behavior of existing methods and provide new principled solutions.", "keywords": "Generative Models;Variational Autoencoder;Generative Adversarial Network", "primary_area": "", "supplementary_material": "", "author": "Shengjia Zhao;Jiaming Song;Stefano Ermon", "authorids": "sjzhao@stanford.edu;tsong@cs.stanford.edu;ermon@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhao2018the,\ntitle={The Information-Autoencoding Family: A Lagrangian Perspective on Latent Variable Generative Modeling},\nauthor={Shengjia Zhao and Jiaming Song and Stefano Ermon},\nyear={2018},\nurl={https://openreview.net/forum?id=ryZERzWCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryZERzWCZ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2727235418070867606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ryZElGZ0Z", "title": "Discovery of Predictive Representations With a Network of General Value Functions", "track": "main", "status": "Reject", "tldr": "We investigate a framework for discovery: curating a large collection of predictions, which are used to construct the agent\u2019s representation in partially observable domains.", "abstract": "The ability of an agent to {\\em discover} its own learning objectives has long been considered a key ingredient for artificial general intelligence. Breakthroughs in autonomous decision making and reinforcement learning have primarily been in domains where the agent's goal is outlined and clear: such as playing a game to win, or driving safely. 
Several studies have demonstrated that learning extramural sub-tasks and auxiliary predictions can improve (1) single human-specified task learning, (2) transfer of learning, (3) and the agent's learned representation of the world. In all these examples, the agent was instructed what to learn about. We investigate a framework for discovery: curating a large collection of predictions, which are used to construct the agent's representation of the world. Specifically, our system maintains a large collection of predictions, continually pruning and replacing predictions. We highlight the importance of considering stability rather than convergence for such a system, and develop an adaptive, regularized algorithm towards that aim. We provide several experiments in computational micro-worlds demonstrating that this simple approach can be effective for discovering useful predictions autonomously.", "keywords": "Reinforcement Learning;General Value Functions;Predictive Representations", "primary_area": "", "supplementary_material": "", "author": "Matthew Schlegel;Andrew Patterson;Adam White;Martha White", "authorids": "mkschleg@ualberta.ca;andnpatt@indiana.edu;amw8@ualberta.ca;whitem@ualberta.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschlegel2018discovery,\ntitle={Discovery of Predictive Representations With a Network of General Value Functions},\nauthor={Matthew Schlegel and Andrew Patterson and Adam White and Martha White},\nyear={2018},\nurl={https://openreview.net/forum?id=ryZElGZ0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryZElGZ0Z", "pdf_size": 0, "rating": "4;4;5", "confidence": "1;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12941806050597772067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "On the Information Bottleneck Theory of Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/63", "id": "ry_WPG-A-", "author_site": "Andrew Saxe, Yamini Bansal, Joel Dapello, Madhu Advani, Artemy Kolchinsky, Brendan D Tracey, David D Cox", "tldr": "We show that several claims of the information bottleneck theory of deep learning are not true in the general case.", "abstract": "The practical successes of deep neural networks have not been matched by theoretical progress that satisfyingly explains their behavior. In this work, we study the information bottleneck (IB) theory of deep learning, which makes three specific claims: first, that deep networks undergo two distinct phases consisting of an initial fitting phase and a subsequent compression phase; second, that the compression phase is causally related to the excellent generalization performance of deep networks; and third, that the compression phase occurs due to the diffusion-like behavior of stochastic gradient descent. Here we show that none of these claims hold true in the general case. 
Through a combination of analytical results and simulation, we demonstrate that the information plane trajectory is predominantly a function of the neural nonlinearity employed: double-sided saturating nonlinearities like tanh yield a compression phase as neural activations enter the saturation regime, but linear activation functions and single-sided saturating nonlinearities like the widely used ReLU in fact do not. Moreover, we find that there is no evident causal connection between compression and generalization: networks that do not compress are still capable of generalization, and vice versa. Next, we show that the compression phase, when it exists, does not arise from stochasticity in training by demonstrating that we can replicate the IB findings using full batch gradient descent rather than stochastic gradient descent. Finally, we show that when an input domain consists of a subset of task-relevant and task-irrelevant information, hidden representations do compress the task-irrelevant information, although the overall information about the input may monotonically increase with training time, and that this compression happens concurrently with the fitting process rather than during a subsequent compression period.", "keywords": "information bottleneck;deep learning;deep linear networks", "primary_area": "", "supplementary_material": "", "author": "Andrew Michael Saxe;Yamini Bansal;Joel Dapello;Madhu Advani;Artemy Kolchinsky;Brendan Daniel Tracey;David Daniel Cox", "authorids": "asaxe@fas.harvard.edu;ybansal@g.harvard.edu;dapello@g.harvard.edu;madvani@fas.harvard.edu;artemyk@gmail.com;tracey.brendan@gmail.com;davidcox@fas.harvard.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nmichael2018on,\ntitle={On the Information Bottleneck Theory of Deep Learning},\nauthor={Andrew Michael Saxe and Yamini Bansal and Joel Dapello and Madhu Advani and Artemy Kolchinsky and Brendan Daniel Tracey and David Daniel Cox},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ry_WPG-A-},\n}", "github": "[![github](/images/github_icon.svg) artemyk/ibsgd](https://github.com/artemyk/ibsgd)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 21, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 723, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12271240925674881982&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=ry_WPG-A-", "pdf": "https://openreview.net/pdf?id=ry_WPG-A-", "email": ";;;;;;", "author_num": 7 }, { "id": "ryacTMZRZ", "title": "Jiffy: A Convolutional Approach to Learning Time Series Similarity", "track": "main", "status": "Reject", "tldr": "Jiffy is a convolutional approach to learning a distance metric for multivariate time series that outperforms existing methods in terms of nearest-neighbor classification accuracy.", "abstract": "Computing distances between examples is at the core of many learning algorithms for time series. Consequently, a great deal of work has gone into designing effective time series distance measures. 
We present Jiffy, a simple and scalable distance metric for multivariate time series. Our approach is to reframe the task as a representation learning problem---rather than design an elaborate distance function, we use a CNN to learn an embedding such that the Euclidean distance is effective. By aggressively max-pooling and downsampling, we are able to construct this embedding using a highly compact neural network. Experiments on a diverse set of multivariate time series datasets show that our approach consistently outperforms existing methods.", "keywords": "Time Series;Time Series Classification", "primary_area": "", "supplementary_material": "", "author": "Divya Shanmugam;Davis Blalock;John Guttag", "authorids": "divyas@mit.edu;dblalock@mit.edu;jguttag@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nshanmugam2018jiffy,\ntitle={Jiffy: A Convolutional Approach to Learning Time Series Similarity},\nauthor={Divya Shanmugam and Davis Blalock and John Guttag},\nyear={2018},\nurl={https://openreview.net/forum?id=ryacTMZRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryacTMZRZ", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=75188469724087308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Communication Algorithms via Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/20", "id": "ryazCMbR-", "author_site": "Hyeji Kim, Yihan Jiang, Ranvir B Rana, Sreeram Kannan, Sewoong Oh, Pramod Viswanath", "tldr": "We show that creatively designed and trained RNN architectures can decode well known sequential codes and achieve close to optimal performances.", "abstract": "Coding theory is a central discipline underpinning wireline and wireless modems that are the workhorses of the information age. Progress in coding theory is largely driven by individual human ingenuity with sporadic breakthroughs over the past century. In this paper we study whether it is possible to automate the discovery of decoding algorithms via deep learning. We study a family of sequential codes parametrized by recurrent neural network (RNN) architectures. We show that cre- atively designed and trained RNN architectures can decode well known sequential codes such as the convolutional and turbo codes with close to optimal performance on the additive white Gaussian noise (AWGN) channel, which itself is achieved by breakthrough algorithms of our times (Viterbi and BCJR decoders, representing dynamic programing and forward-backward algorithms). We show strong gen- eralizations, i.e., we train at a specific signal to noise ratio and block length but test at a wide range of these quantities, as well as robustness and adaptivity to deviations from the AWGN setting.", "keywords": "coding theory;recurrent neural network;communication", "primary_area": "", "supplementary_material": "", "author": "Hyeji Kim;Yihan Jiang;Ranvir B. 
Rana;Sreeram Kannan;Sewoong Oh;Pramod Viswanath", "authorids": "hyejikim@illinois.edu;yihanrogerjiang@gmail.com;rbrana2@illinois.edu;ksreeram@uw.edu;sewoong79@gmail.com;pramodv@illinois.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nkim2018communication,\ntitle={Communication Algorithms via Deep Learning},\nauthor={Hyeji Kim and Yihan Jiang and Ranvir B. Rana and Sreeram Kannan and Sewoong Oh and Pramod Viswanath},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryazCMbR-},\n}", "github": "[![github](/images/github_icon.svg) yihanjiang/Sequential-RNN-Decoder](https://github.com/yihanjiang/Sequential-RNN-Decoder) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ryazCMbR-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "2;6;9", "confidence": "4;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 31, "authors#_avg": 6, "corr_rating_confidence": 0.8219949365267863, "gs_citation": 289, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3745511757842142493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=ryazCMbR-", "pdf": "https://openreview.net/pdf?id=ryazCMbR-", "email": ";;;;;", "author_num": 6 }, { "id": "ryb83alCZ", "title": "Towards Unsupervised Classification with Deep Generative Models", "track": "main", "status": "Reject", "tldr": "Unsupervised classification via deep generative modeling with controllable feature learning evaluated in a difficult real world task", "abstract": "Deep generative models have advanced the state-of-the-art in semi-supervised classification, however their capacity for deriving useful discriminative features in a completely unsupervised fashion for classification in difficult real-world data sets, where adequate manifold separation is required has not been adequately explored. Most methods rely on defining a pipeline of deriving features via generative modeling and then applying clustering algorithms, separating the modeling and discriminative processes. We propose a deep hierarchical generative model which uses a mixture of discrete and continuous distributions to learn to effectively separate the different data manifolds and is trainable end-to-end. We show that by specifying the form of the discrete variable distribution we are imposing a specific structure on the model's latent representations. 
We test our model's discriminative performance on the task of CLL diagnosis against baselines from the field of computational FC, as well as the Variational Autoencoder literature.", "keywords": "variational inference;vae;variational autoencoders;generative modeling;representation learning;classification", "primary_area": "", "supplementary_material": "", "author": "Dimitris Kalatzis;Konstantia Kotta;Ilias Kalamaras;Anastasios Vafeiadis;Andrew Rawstron;Dimitris Tzovaras;Kostas Stamatopoulos", "authorids": "dkal@iti.gr;ntina_kotta@yahoo.com;kalamar@iti.gr;anasvaf@iti.gr;a.c.rawstron@leeds.ac.uk;dimitrios.tzovaras@iti.gr;kostas.stamatopoulos@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nkalatzis2018towards,\ntitle={Towards Unsupervised Classification with Deep Generative Models},\nauthor={Dimitris Kalatzis and Konstantia Kotta and Ilias Kalamaras and Anastasios Vafeiadis and Andrew Rawstron and Dimitris Tzovaras and Kostas Stamatopoulos},\nyear={2018},\nurl={https://openreview.net/forum?id=ryb83alCZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=ryb83alCZ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dIxAWelFdXoJ:scholar.google.com/&scioq=Towards+Unsupervised+Classification+with+Deep+Generative+Models&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rybAWfx0b", "title": "COLD FUSION: TRAINING SEQ2SEQ MODELS TOGETHER WITH LANGUAGE MODELS", "track": "main", "status": "Workshop", "tldr": "We introduce a novel method to train Seq2Seq models with language models that converge faster, generalize better and can almost completely transfer to a new domain using less than 10% of labeled data.", "abstract": "Sequence-to-sequence (Seq2Seq) models with attention have excelled at tasks which involve generating natural language sentences such as machine translation, image captioning and speech recognition. Performance has further been improved by leveraging unlabeled data, often in the form of a language model. In this work, we present the Cold Fusion method, which leverages a pre-trained language model during training, and show its effectiveness on the speech recognition task. 
We show that Seq2Seq models with Cold Fusion are able to better utilize language information enjoying i) faster convergence and better generalization, and ii) almost complete transfer to a new domain while using less than 10% of the labeled training data.", "keywords": "Sequence-to-Sequence Models;Speech Recognition;Language Models", "primary_area": "", "supplementary_material": "", "author": "Anuroop Sriram;Heewoo Jun;Sanjeev Satheesh;Adam Coates", "authorids": "anuroop.sriram@gmail.com;junheewoo@baidu.com;sanjeevsatheesh@baidu.com;adamcoates@baidu.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsriram2018cold,\ntitle={{COLD} {FUSION}: {TRAINING} {SEQ}2{SEQ} {MODELS} {TOGETHER} {WITH} {LANGUAGE} {MODELS}},\nauthor={Anuroop Sriram and Heewoo Jun and Sanjeev Satheesh and Adam Coates},\nyear={2018},\nurl={https://openreview.net/forum?id=rybAWfx0b},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rybAWfx0b", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;5;5", "rating_avg": 5.333333333333333, "confidence_avg": 5.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 358, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5249519533302773894&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "rybDdHe0Z", "title": "Sequence Transfer Learning for Neural Decoding", "track": "main", "status": "Reject", "tldr": "", "abstract": "A fundamental challenge in designing brain-computer interfaces (BCIs) is decoding behavior from time-varying neural oscillations. In typical applications, decoders are constructed for individual subjects and with limited data leading to restrictions on the types of models that can be utilized. Currently, the best performing decoders are typically linear models capable of utilizing rigid timing constraints with limited training data. Here we demonstrate the use of Long Short-Term Memory (LSTM) networks to take advantage of the temporal information present in sequential neural data collected from subjects implanted with electrocorticographic (ECoG) electrode arrays performing a finger flexion task. Our constructed models are capable of achieving accuracies that are comparable to existing techniques while also being robust to variation in sample data size. Moreover, we utilize the LSTM networks and an affine transformation layer to construct a novel architecture for transfer learning. We demonstrate that in scenarios where only the affine transform is learned for a new subject, it is possible to achieve results comparable to existing state-of-the-art techniques. The notable advantage is the increased stability of the model during training on novel subjects. Relaxing the constraint of only training the affine transformation, we establish our model as capable of exceeding performance of current models across all training data sizes. 
Overall, this work demonstrates that LSTMs are a versatile model that can accurately capture temporal patterns in neural data and can provide a foundation for transfer learning in neural decoding.", "keywords": "Transfer Learning;Applications;Neural decoding", "primary_area": "", "supplementary_material": "", "author": "Venkatesh Elango*;Aashish N Patel*;Kai J Miller;Vikash Gilja", "authorids": "velango@eng.ucsd.edu;anp054@eng.ucsd.edu;kai.miller@stanford.edu;vgilja@eng.ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nelango*2018sequence,\ntitle={Sequence Transfer Learning for Neural Decoding},\nauthor={Venkatesh Elango* and Aashish N Patel* and Kai J Miller and Vikash Gilja},\nyear={2018},\nurl={https://openreview.net/forum?id=rybDdHe0Z},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rybDdHe0Z", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;5;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10433845078652478476&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "SMASH: One-Shot Model Architecture Search through HyperNetworks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/338", "id": "rydeCEhs-", "author_site": "Andrew Brock, Theo Lim, James Ritchie, Nick Weston", "tldr": "A technique for accelerating neural architecture selection by approximating the weights of each candidate architecture instead of training them individually.", "abstract": "Designing architectures for deep neural networks requires expert knowledge and substantial computation time. We propose a technique to accelerate architecture selection by learning an auxiliary HyperNet that generates the weights of a main model conditioned on that model's architecture. By comparing the relative validation performance of networks with HyperNet-generated weights, we can effectively search over a wide range of architectures at the cost of a single training run. To facilitate this search, we develop a flexible mechanism based on memory read-writes that allows us to define a wide range of network connectivity patterns, with ResNet, DenseNet, and FractalNet blocks as special cases. We validate our method (SMASH) on CIFAR-10 and CIFAR-100, STL-10, ModelNet10, and Imagenet32x32, achieving competitive performance with similarly-sized hand-designed networks.", "keywords": "meta-learning;architecture search;deep learning;computer vision", "primary_area": "", "supplementary_material": "", "author": "Andrew Brock;Theo Lim;J.M. Ritchie;Nick Weston", "authorids": "ajb5@hw.ac.uk;t.lim@hw.ac.uk;j.m.ritchie@hw.ac.uk;nick.weston@renishaw.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbrock2018smash,\ntitle={{SMASH}: One-Shot Model Architecture Search through HyperNetworks},\nauthor={Andrew Brock and Theo Lim and J.M. 
Ritchie and Nick Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rydeCEhs-},\n}", "github": "[![github](/images/github_icon.svg) ajbrock/SMASH](https://github.com/ajbrock/SMASH)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 925, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10456857144668119976&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rydeCEhs-", "pdf": "https://openreview.net/pdf?id=rydeCEhs-", "email": ";;;", "author_num": 4 }, { "id": "rye7IMbAZ", "title": "Explicit Induction Bias for Transfer Learning with Convolutional Networks", "track": "main", "status": "Reject", "tldr": "In inductive transfer learning, fine-tuning pre-trained convolutional networks substantially outperforms training from scratch.", "abstract": "In inductive transfer learning, fine-tuning pre-trained convolutional networks substantially outperforms training from scratch.\nWhen using fine-tuning, the underlying assumption is that the pre-trained model extracts generic features, which are at least partially relevant for solving the target task, but would be difficult to extract from the limited amount of data available on the target task.\nHowever, besides the initialization with the pre-trained model and the early stopping, there is no mechanism in fine-tuning for retaining the features learned on the source task.\nIn this paper, we investigate several regularization schemes that explicitly promote the similarity of the final solution with the initial model.\nWe eventually recommend a simple $L^2$ penalty using the pre-trained model as a reference, and we show that this approach behaves much better than the standard scheme using weight decay on a partially frozen network.", "keywords": "transfer Learning;convolutional networks;fine-tuning;regularization;induction bias", "primary_area": "", "supplementary_material": "", "author": "Xuhong LI;Yves GRANDVALET;Franck DAVOINE", "authorids": "xuhong.li@utc.fr;yves.grandvalet@utc.fr;franck.davoine@utc.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2018,\ntitle={ Explicit Induction Bias for Transfer Learning with Convolutional Networks},\nauthor={Xuhong LI and Yves GRANDVALET and Franck DAVOINE},\nyear={2018},\nurl={https://openreview.net/forum?id=rye7IMbAZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rye7IMbAZ", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 413, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8930504204220703046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "ryepFJbA-", "title": "On Convergence and Stability of GANs", "track": "main", "status": "Reject", "tldr": "Analysis of convergence and mode collapse by studying GAN training process as regret minimization", "abstract": "We propose studying GAN training dynamics as 
regret minimization, which is in contrast to the popular view that there is consistent minimization of a divergence between real and generated distributions. We analyze the convergence of GAN training from this new point of view to understand why mode collapse happens. We hypothesize the existence of undesirable local equilibria in this non-convex game to be responsible for mode collapse. We observe that these local equilibria often exhibit sharp gradients of the discriminator function around some real data points. We demonstrate that these degenerate local equilibria can be avoided with a gradient penalty scheme called DRAGAN. We show that DRAGAN enables faster training, achieves improved stability with fewer mode collapses, and leads to generator networks with better modeling performance across a variety of architectures and objective functions.", "keywords": "GAN;Generative Adversarial Networks;Mode Collapse;Stability;Game Theory;Regret Minimization;Convergence;Gradient Penalty", "primary_area": "", "supplementary_material": "", "author": "Naveen Kodali;James Hays;Jacob Abernethy;Zsolt Kira", "authorids": "nkodali3@gatech.edu;hays@gatech.edu;prof@gatech.edu;zkira@gatech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkodali2018on,\ntitle={On Convergence and Stability of {GAN}s},\nauthor={Naveen Kodali and James Hays and Jacob Abernethy and Zsolt Kira},\nyear={2018},\nurl={https://openreview.net/forum?id=ryepFJbA-},\n}", "github": "[![github](/images/github_icon.svg) kodalinaveen3/DRAGAN](https://github.com/kodalinaveen3/DRAGAN) + [![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=ryepFJbA-)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryepFJbA-", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;5;2", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 27, "authors#_avg": 4, "corr_rating_confidence": -0.3273268353539886, "gs_citation": 768, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17817086201379529858&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Training Confidence-calibrated Classifiers for Detecting Out-of-Distribution Samples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/225", "id": "ryiAv2xAZ", "author_site": "Kimin Lee, Honglak Lee, Kibok Lee, Jinwoo Shin", "tldr": "", "abstract": "The problem of detecting whether a test sample is from in-distribution (i.e., training distribution by a classifier) or out-of-distribution sufficiently different from it arises in many real-world machine learning applications. However, the state-of-art deep neural networks are known to be highly overconfident in their predictions, i.e., do not distinguish in- and out-of-distributions. Recently, to handle this issue, several threshold-based detectors have been proposed given pre-trained neural classifiers. However, the performance of prior works highly depends on how to train the classifiers since they only focus on improving inference procedures. In this paper, we develop a novel training method for classifiers so that such inference algorithms can work better. In particular, we suggest two additional terms added to the original loss (e.g., cross entropy). 
The first one forces samples from out-of-distribution less confident by the classifier and the second one is for (implicitly) generating most effective training samples for the first one. In essence, our method jointly trains both classification and generative neural networks for out-of-distribution. We demonstrate its effectiveness using deep convolutional neural networks on various popular image datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kimin Lee;Honglak Lee;Kibok Lee;Jinwoo Shin", "authorids": "kiminlee@kaist.ac.kr;honglak@eecs.umich.edu;kibok@umich.edu;jinwoos@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlee2018training,\ntitle={Training Confidence-calibrated Classifiers for Detecting Out-of-Distribution Samples},\nauthor={Kimin Lee and Honglak Lee and Kibok Lee and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryiAv2xAZ},\n}", "github": "[![github](/images/github_icon.svg) alinlab/Confident_classifier](https://github.com/alinlab/Confident_classifier) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ryiAv2xAZ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14294577348397503039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=ryiAv2xAZ", "pdf": "https://openreview.net/pdf?id=ryiAv2xAZ", "email": ";;;", "author_num": 4 }, { "id": "ryj0790hb", "title": "Incremental Learning through Deep Adaptation", "track": "main", "status": "Reject", "tldr": "An alternative to transfer learning that learns faster, requires much less parameters (3-13 %), usually achieves better results and precisely preserves performance on old tasks.", "abstract": "Given an existing trained neural network, it is often desirable to learn new capabilities without hindering performance of those already learned. Existing approaches either learn sub-optimal solutions, require joint training, or incur a substantial increment in the number of parameters for each added task, typically as many as the original network. We propose a method called Deep Adaptation Networks (DAN) that constrains newly learned filters to be linear combinations of existing ones. DANs preserve performance on the original task, require a fraction (typically 13%) of the number of parameters compared to standard fine-tuning procedures and converge in less cycles of training to a comparable or better level of performance. When coupled with standard network quantization techniques, we further reduce the parameter cost to around 3% of the original with negligible or no loss in accuracy. The learned architecture can be controlled to switch between various learned representations, enabling a single network to solve a task from multiple different domains. 
We conduct extensive experiments showing the effectiveness of our method on a range of image classification tasks and explore different aspects of its behavior.", "keywords": "Transfer Learning;Learning without forgetting;Multitask Learning", "primary_area": "", "supplementary_material": "", "author": "Amir Rosenfeld;John K. Tsotsos", "authorids": "amir.rosenfeld@gmail.com;amir.rosenfeld@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nrosenfeld2018incremental,\ntitle={Incremental Learning through Deep Adaptation},\nauthor={Amir Rosenfeld and John K. Tsotsos},\nyear={2018},\nurl={https://openreview.net/forum?id=ryj0790hb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryj0790hb", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 334, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17587505718183561767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ryj38zWRb", "title": "Optimizing the Latent Space of Generative Networks", "track": "main", "status": "Reject", "tldr": "Are GANs successful because of adversarial training or the use of ConvNets? We show a ConvNet generator trained with a simple reconstruction loss and learnable noise vectors leads many of the desirable properties of a GAN.", "abstract": "Generative Adversarial Networks (GANs) have achieved remarkable results in the task of generating realistic natural images. In most applications, GAN models share two aspects in common. On the one hand, GANs training involves solving a challenging saddle point optimization problem, interpreted as an adversarial game between a generator and a discriminator functions. On the other hand, the generator and the discriminator are parametrized in terms of deep convolutional neural networks. The goal of this paper is to disentangle the contribution of these two factors to the success of GANs. In particular, we introduce Generative Latent Optimization (GLO), a framework to train deep convolutional generators without using discriminators, thus avoiding the instability of adversarial optimization problems. 
Throughout a variety of experiments, we show that GLO enjoys many of the desirable properties of GANs: learning from large data, synthesizing visually-appealing samples, interpolating meaningfully between samples, and performing linear arithmetic with noise vectors.", "keywords": "generative models;latent variable models;image generation;generative adversarial networks;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Piotr Bojanowski;Armand Joulin;David Lopez-Paz;Arthur Szlam", "authorids": "bojanowski@fb.com;ajoulin@fb.com;dlp@fb.com;aszlam@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbojanowski2018optimizing,\ntitle={Optimizing the Latent Space of Generative Networks},\nauthor={Piotr Bojanowski and Armand Joulin and David Lopez-Paz and Arthur Szlam},\nyear={2018},\nurl={https://openreview.net/forum?id=ryj38zWRb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryj38zWRb", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 542, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7705272916319621438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ryjw_eAaZ", "title": "Unsupervised Deep Structure Learning by Recursive Dependency Analysis", "track": "main", "status": "Reject", "tldr": "A principled approach for structure learning of deep neural networks with a new interpretation for depth and inter-layer connectivity. ", "abstract": "We introduce an unsupervised structure learning algorithm for deep, feed-forward, neural networks. We propose a new interpretation for depth and inter-layer connectivity where a hierarchy of independencies in the input distribution is encoded in the network structure. This results in structures allowing neurons to connect to neurons in any deeper layer skipping intermediate layers. Moreover, neurons in deeper layers encode low-order (small condition sets) independencies and have a wide scope of the input, whereas neurons in the first layers encode higher-order (larger condition sets) independencies and have a narrower scope. Thus, the depth of the network is automatically determined---equal to the maximal order of independence in the input distribution, which is the recursion-depth of the algorithm. The proposed algorithm constructs two main graphical models: 1) a generative latent graph (a deep belief network) learned from data and 2) a deep discriminative graph constructed from the generative latent graph. We prove that conditional dependencies between the nodes in the learned generative latent graph are preserved in the class-conditional discriminative graph. Finally, a deep neural network structure is constructed based on the discriminative graph. We demonstrate on image classification benchmarks that the algorithm replaces the deepest layers (convolutional and dense layers) of common convolutional networks, achieving high classification accuracy, while constructing significantly smaller structures. 
The proposed structure learning algorithm requires a small computational cost and runs efficiently on a standard desktop CPU.", "keywords": "unsupervised learning;structure learning;deep belief networks;probabilistic graphical models;Bayesian networks", "primary_area": "", "supplementary_material": "", "author": "Raanan Y. Yehezkel Rohekar;Guy Koren;Shami Nisimov;Gal Novik", "authorids": "raanan.y.yehezkel.rohekar@intel.com;guy.koren@intel.com;shami.nisimov@intel.com;gal.novik@intel.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ny.2018unsupervised,\ntitle={Unsupervised Deep Structure Learning by Recursive Dependency Analysis},\nauthor={Raanan Y. Yehezkel Rohekar and Guy Koren and Shami Nisimov and Gal Novik},\nyear={2018},\nurl={https://openreview.net/forum?id=ryjw_eAaZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=ryjw_eAaZ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ercg26l0N1gJ:scholar.google.com/&scioq=Unsupervised+Deep+Structure+Learning+by+Recursive+Dependency+Analysis&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "ryk77mbRZ", "title": "Noise-Based Regularizers for Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent neural networks (RNNs) are powerful models for sequential data. They can approximate arbitrary computations, and have been used successfully in domains such as text and speech. However, the flexibility of RNNs makes them susceptible to overfitting and regularization is important. We develop a noise-based regularization method for RNNs. The idea is simple and easy to implement: we inject noise in the hidden units of the RNN and then maximize the original RNN's likelihood averaged over the injected noise. On a language modeling benchmark, our method achieves better performance than the deterministic RNN and the variational dropout.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adji B. Dieng;Jaan Altosaar;Rajesh Ranganath;David M. Blei", "authorids": "abd2141@columbia.edu;altosaar@princeton.edu;rajeshr@cs.princeton.edu;david.blei@columbia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nb.2018noisebased,\ntitle={Noise-Based Regularizers for Recurrent Neural Networks},\nauthor={Adji B. Dieng and Jaan Altosaar and Rajesh Ranganath and David M. 
Blei},\nyear={2018},\nurl={https://openreview.net/forum?id=ryk77mbRZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryk77mbRZ", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.3273268353539886, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2187981823056029419&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "On Unifying Deep Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/166", "id": "rylSzl-R-", "author_site": "Zhiting Hu, , Ruslan Salakhutdinov, Eric P Xing", "tldr": "A unified statistical view of the broad class of deep generative models ", "abstract": "Deep generative models have achieved impressive success in recent years. Generative Adversarial Networks (GANs) and Variational Autoencoders (VAEs), as powerful frameworks for deep generative model learning, have largely been considered as two distinct paradigms and received extensive independent studies respectively. This paper aims to establish formal connections between GANs and VAEs through a new formulation of them. We interpret sample generation in GANs as performing posterior inference, and show that GANs and VAEs involve minimizing KL divergences of respective posterior and inference distributions with opposite directions, extending the two learning phases of classic wake-sleep algorithm, respectively. The unified view provides a powerful tool to analyze a diverse set of existing model variants, and enables to transfer techniques across research lines in a principled way. For example, we apply the importance weighting method in VAE literatures for improved GAN learning, and enhance VAEs with an adversarial mechanism that leverages generated samples. Experiments show generality and effectiveness of the transfered techniques. ", "keywords": "deep generative models;generative adversarial networks;variational autoencoders;variational inference", "primary_area": "", "supplementary_material": "", "author": "Zhiting Hu;Zichao Yang;Ruslan Salakhutdinov;Eric P. Xing", "authorids": "zhitinghu@gmail.com;yangtze2301@gmail.com;rsalakhu@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhu2018on,\ntitle={On Unifying Deep Generative Models},\nauthor={Zhiting Hu and Zichao Yang and Ruslan Salakhutdinov and Eric P. 
Xing},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rylSzl-R-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 152, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2969749704593930195&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rylSzl-R-", "pdf": "https://openreview.net/pdf?id=rylSzl-R-", "email": ";;;", "author_num": 4 }, { "id": "rylejExC-", "title": "Stochastic Training of Graph Convolutional Networks", "track": "main", "status": "Reject", "tldr": "A control variate based stochastic training algorithm for graph convolutional networks that the receptive field can be only two neighbors per node.", "abstract": "Graph convolutional networks (GCNs) are powerful deep neural networks for graph-structured data. However, GCN computes nodes' representation recursively from their neighbors, making the receptive field size grow exponentially with the number of layers. Previous attempts on reducing the receptive field size by subsampling neighbors do not have any convergence guarantee, and their receptive field size per node is still in the order of hundreds. In this paper, we develop a preprocessing strategy and two control variate based algorithms to further reduce the receptive field size. Our algorithms are guaranteed to converge to GCN's local optimum regardless of the neighbor sampling size. Empirical results show that our algorithms have a similar convergence speed per epoch with the exact algorithm even using only two neighbors per node. 
The time consumption of our algorithm on the Reddit dataset is only one fifth of previous neighbor sampling algorithms.", "keywords": "Graph convolutional networks;stochastic gradient descent;variance reduction;control variate", "primary_area": "", "supplementary_material": "", "author": "Jianfei Chen;Jun Zhu", "authorids": "chenjian14@mails.tsinghua.edu.cn;dcszj@mail.tsinghua.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchen2018stochastic,\ntitle={Stochastic Training of Graph Convolutional Networks},\nauthor={Jianfei Chen and Jun Zhu},\nyear={2018},\nurl={https://openreview.net/forum?id=rylejExC-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rylejExC-", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": -0.9707253433941508, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13877419458271890332&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rylfg-DNM", "title": "Anticipatory Asynchronous Advantage Actor-Critic (A4C): The power of Anticipation in Deep Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "Anticipation improves convergence of deep reinforcement learning.", "abstract": "We propose to extend existing deep reinforcement learning (Deep RL) algorithms by allowing them to additionally choose sequences of actions as a part of their policy. This modification forces the network to anticipate the reward of action sequences, which, as we show, improves the exploration leading to better convergence. Our proposal is simple, flexible, and can be easily incorporated into any Deep RL framework. 
We show the power of our scheme by consistently outperforming the state-of-the-art GA3C algorithm on several popular Atari Games.", "keywords": "deep reinforcement learning;A3C;deep learning;Atari games", "primary_area": "", "supplementary_material": "", "author": "Xun Luan;Tharun Medini;Anshumali Shrivastava", "authorids": "xun.luan@rice.edu;trm3@rice.edu;anshumali@rice.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rylfg-DNM", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;4", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2hE2G16Pj_0J:scholar.google.com/&scioq=Anticipatory+Asynchronous+Advantage+Actor-Critic+(A4C):+The+power+of+Anticipation+in+Deep+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rynniUpQM", "title": "Learning with Mental Imagery", "track": "main", "status": "Withdraw", "tldr": "Object instance recognition with adversarial autoencoders was performed with a novel 'mental image' target that is canonical representation of the input image.", "abstract": "In this paper, we propose deep convolutional generative adversarial networks (DCGAN) that learn to produce a 'mental image' of the input image as internal representation of a certain category of input data distribution. This mental image is what the DCGAN 'imagines' that the input image might look like under ideal conditions. The mental image contains a version of the input that is iconic, without any peculiarities that do not contribute to the ideal representation of the input data distribution within a category. A DCGAN learns this association by training an encoder to capture salient features from the original image and a decoder to convert salient features into its associated mental image representation. Our new approach, which we refer to as a Mental Image DCGAN (MIDCGAN), learns features that are useful for recognizing entire classes of objects, and that this in turn has the benefit of helping single and zero shot recognition. 
We demonstrate our approach on object instance recognition and handwritten digit recognition tasks.", "keywords": "Deep Learning;Adversarial Networks;Object Instance Recognition;Cognitive AI", "primary_area": "", "supplementary_material": "", "author": "Anonymous", "authorids": "ICLR.cc/2018/Conference/Paper839/Authors", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@article{\n anonymous2018learning,\n title={Learning with Mental Imagery},\n author={Anonymous},\n journal={International Conference on Learning Representations},\n year={2018},\n url={https://openreview.net/forum?id=HJY9Sf-0b}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rynniUpQM", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 3, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "LEARNING TO SHARE: SIMULTANEOUS PARAMETER TYING AND SPARSIFICATION IN DEEP LEARNING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/27", "id": "rypT3fb0b", "author_site": "Dejiao Zhang, Haozhu Wang, Mario Figueiredo, Laura Balzano", "tldr": "We have proposed using the recent GrOWL regularizer for simultaneous parameter sparsity and tying in DNN learning. ", "abstract": "Deep neural networks (DNNs) usually contain millions, maybe billions, of parameters/weights, making both storage and computation very expensive. This has motivated a large body of work to reduce the complexity of the neural network by using sparsity-inducing regularizers. Another well-known approach for controlling the complexity of DNNs is parameter sharing/tying, where certain sets of weights are forced to share a common value. Some forms of weight sharing are hard-wired to express certain in- variances, with a notable example being the shift-invariance of convolutional layers. However, there may be other groups of weights that may be tied together during the learning process, thus further re- ducing the complexity of the network. In this paper, we adopt a recently proposed sparsity-inducing regularizer, named GrOWL (group ordered weighted l1), which encourages sparsity and, simulta- neously, learns which groups of parameters should share a common value. GrOWL has been proven effective in linear regression, being able to identify and cope with strongly correlated covariates. Unlike standard sparsity-inducing regularizers (e.g., l1 a.k.a. Lasso), GrOWL not only eliminates unimportant neurons by setting all the corresponding weights to zero, but also explicitly identifies strongly correlated neurons by tying the corresponding weights to a common value. This ability of GrOWL motivates the following two-stage procedure: (i) use GrOWL regularization in the training process to simultaneously identify significant neurons and groups of parameter that should be tied together; (ii) retrain the network, enforcing the structure that was unveiled in the previous phase, i.e., keeping only the significant neurons and enforcing the learned tying structure. 
We evaluate the proposed approach on several benchmark datasets, showing that it can dramatically compress the network with slight or even no loss on generalization performance.\n", "keywords": "Compressing neural network;simultaneously parameter tying and sparsification;group ordered l1 regularization", "primary_area": "", "supplementary_material": "", "author": "Dejiao Zhang;Haozhu Wang;Mario Figueiredo;Laura Balzano", "authorids": "dejiao@umich.edu;hzwang@umich.edu;mario.figueiredo@lx.it.pt;girasole@umich.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018learning,\ntitle={{LEARNING} {TO} {SHARE}: {SIMULTANEOUS} {PARAMETER} {TYING} {AND} {SPARSIFICATION} {IN} {DEEP} {LEARNING}},\nauthor={Dejiao Zhang and Haozhu Wang and Mario Figueiredo and Laura Balzano},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rypT3fb0b},\n}", "github": "[![github](/images/github_icon.svg) Dejiao2018/GrOWL](https://github.com/Dejiao2018/GrOWL)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;5", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2292611582498005860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rypT3fb0b", "pdf": "https://openreview.net/pdf?id=rypT3fb0b", "email": ";;;", "author_num": 4 }, { "id": "ryserbZR-", "title": "Classification and Disease Localization in Histopathology Using Only Global Labels: A Weakly-Supervised Approach", "track": "main", "status": "Reject", "tldr": "We propose a weakly supervised learning method for the classification and localization of cancers in extremely high resolution histopathology whole slide images using only image-wide labels.", "abstract": "Analysis of histopathology slides is a critical step for many diagnoses, and in particular in oncology where it defines the gold standard. In the case of digital histopathological analysis, highly trained pathologists must review vast whole-slide images of extreme digital resolution (100,000^2 pixels) across multiple zoom levels in order to locate abnormal regions of cells, or in some cases single cells, out of millions. The application of deep learning to this problem is hampered not only by small sample sizes, as typical datasets contain only a few hundred samples, but also by the generation of ground-truth localized annotations for training interpretable classification and segmentation models. We propose a method for disease localization and classification that uses only image-wide labels, the sole annotation available during training. Even without pixel-level annotations, we are able to demonstrate performance comparable with models trained with strong annotations on the Camelyon-16 lymph node metastases detection challenge. We accomplish this through the use of pre-trained deep convolutional networks, feature embedding, as well as learning via top instances and negative evidence, a multiple instance learning technique from the field of semantic segmentation and object detection.", "keywords": "Weakly Supervised Learning;Medical Imaging;Histopathology;Deep Feature Extraction", "primary_area": "", "supplementary_material": "", "author": "Pierre Courtiol;Eric W. 
Tramel;Marc Sanselme;Gilles Wainrib", "authorids": "pierre.courtiol@owkin.com;eric.tramel@owkin.com;marc.sanselme@owkin.com;gilles.wainrib@owkin.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ncourtiol2018classification,\ntitle={Classification and Disease Localization in Histopathology Using Only Global Labels: A Weakly-Supervised Approach},\nauthor={Pierre Courtiol and Eric W. Tramel and Marc Sanselme and Gilles Wainrib},\nyear={2018},\nurl={https://openreview.net/forum?id=ryserbZR-},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryserbZR-", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3429619124969026005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Training wide residual networks for deployment using a single bit for each weight", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/311", "id": "rytNfI1AZ", "author_site": "Mark D. McDonnell", "tldr": "We train wide residual networks that can be immediately deployed using only a single bit for each convolutional weight, with signficantly better accuracy than past methods.", "abstract": "For fast and energy-efficient deployment of trained deep neural networks on resource-constrained embedded hardware, each learned weight parameter should ideally be represented and stored using a single bit. Error-rates usually increase when this requirement is imposed. Here, we report large improvements in error rates on multiple datasets, for deep convolutional neural networks deployed with 1-bit-per-weight. Using wide residual networks as our main baseline, our approach simplifies existing methods that binarize weights by applying the sign function in training; we apply scaling factors for each layer with constant unlearned values equal to the layer-specific standard deviations used for initialization. For CIFAR-10, CIFAR-100 and ImageNet, and models with 1-bit-per-weight requiring less than 10 MB of parameter memory, we achieve error rates of 3.9%, 18.5% and 26.0% / 8.5% (Top-1 / Top-5) respectively. We also considered MNIST, SVHN and ImageNet32, achieving 1-bit-per-weight test results of 0.27%, 1.9%, and 41.3% / 19.1% respectively. For CIFAR, our error rates halve previously reported values, and are within about 1% of our error-rates for the same network with full-precision weights. For networks that overfit, we also show significant improvements in error rate by not learning batch normalization scale and offset parameters. This applies to both full precision and 1-bit-per-weight networks. Using a warm-restart learning-rate schedule, we found that training for 1-bit-per-weight is just as fast as full-precision networks, with better accuracy than standard schedules, and achieved about 98%-99% of peak performance in just 62 training epochs for CIFAR-10/100. 
For full training code and trained models in MATLAB, Keras and PyTorch see https://github.com/McDonnell-Lab/1-bit-per-weight/ .", "keywords": "wide residual networks;model compression;quantization;1-bit weights", "primary_area": "", "supplementary_material": "", "author": "Mark D. McDonnell", "authorids": "mark.mcdonnell@unisa.edu.au", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nd.2018training,\ntitle={Training wide residual networks for deployment using a single bit for each weight},\nauthor={Mark D. McDonnell},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rytNfI1AZ},\n}", "github": "[![github](/images/github_icon.svg) McDonnell-Lab/1-bit-per-weight](https://github.com/McDonnell-Lab/1-bit-per-weight) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rytNfI1AZ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7686605623349914581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rytNfI1AZ", "pdf": "https://openreview.net/pdf?id=rytNfI1AZ", "email": "", "author_num": 1 }, { "title": "FastGCN: Fast Learning with Graph Convolutional Networks via Importance Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/145", "id": "rytstxWAW", "author_site": "Jie Chen, Tengfei Ma, Cao Xiao", "tldr": "", "abstract": "The graph convolutional networks (GCN) recently proposed by Kipf and Welling are an effective graph model for semi-supervised learning. Such a model, however, is transductive in nature because parameters are learned through convolutions with both training and test data. Moreover, the recursive neighborhood expansion across layers poses time and memory challenges for training with large, dense graphs. To relax the requirement of simultaneous availability of test data, we interpret graph convolutions as integral transforms of embedding functions under probability measures. Such an interpretation allows for the use of Monte Carlo approaches to consistently estimate the integrals, which in turn leads to a batched training scheme as we propose in this work---FastGCN. Enhanced with importance sampling, FastGCN not only is efficient for training but also generalizes well for inference. We show a comprehensive set of experiments to demonstrate its effectiveness compared with GCN and related models. 
In particular, training is orders of magnitude more efficient while predictions remain comparably accurate.\n", "keywords": "Graph convolutional networks;importance sampling", "primary_area": "", "supplementary_material": "", "author": "Jie Chen;Tengfei Ma;Cao Xiao", "authorids": "chenjie@us.ibm.com;tengfei.ma1@ibm.com;cxiao@us.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018fastgcn,\ntitle={Fast{GCN}: Fast Learning with Graph Convolutional Networks via Importance Sampling},\nauthor={Jie Chen and Tengfei Ma and Cao Xiao},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rytstxWAW},\n}", "github": "[![github](/images/github_icon.svg) matenure/FastGCN](https://github.com/matenure/FastGCN) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rytstxWAW)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7;8", "confidence": "4;2;4;4", "rating_avg": 7.0, "confidence_avg": 3.5, "replies_avg": 28, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1976, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18054036108684442257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rytstxWAW", "pdf": "https://openreview.net/pdf?id=rytstxWAW", "email": ";;", "author_num": 3 }, { "title": "Measuring the Intrinsic Dimension of Objective Landscapes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/122", "id": "ryup8-WCW", "author_site": "Chunyuan Li, Heerad Farkhoor, Ruoqian Liu, Jason Yosinski", "tldr": "We train in random subspaces of parameter space to measure how many dimensions are really needed to find a solution.", "abstract": "Many recently trained neural networks employ large numbers of parameters to achieve good performance. One may intuitively use the number of parameters required as a rough gauge of the difficulty of a problem. But how accurate are such notions? How many parameters are really needed? In this paper we attempt to answer this question by training networks not in their native parameter space, but instead in a smaller, randomly oriented subspace. We slowly increase the dimension of this subspace, note at which dimension solutions first appear, and define this to be the intrinsic dimension of the objective landscape. The approach is simple to implement, computationally tractable, and produces several suggestive conclusions. Many problems have smaller intrinsic dimensions than one might suspect, and the intrinsic dimension for a given dataset varies little across a family of models with vastly different sizes. This latter result has the profound implication that once a parameter space is large enough to solve a problem, extra parameters serve directly to increase the dimensionality of the solution manifold. Intrinsic dimension allows some quantitative comparison of problem difficulty across supervised, reinforcement, and other types of learning where we conclude, for example, that solving the inverted pendulum problem is 100 times easier than classifying digits from MNIST, and playing Atari Pong from pixels is about as hard as classifying CIFAR-10. 
In addition to providing new cartography of the objective landscapes wandered by parameterized models, the method is a simple technique for constructively obtaining an upper bound on the minimum description length of a solution. A byproduct of this construction is a simple approach for compressing networks, in some cases by more than 100 times.", "keywords": "machine learning;neural networks;intrinsic dimension;random subspace;model understanding", "primary_area": "", "supplementary_material": "", "author": "Chunyuan Li;Heerad Farkhoor;Rosanne Liu;Jason Yosinski", "authorids": "chunyuan.li@duke.edu;heerad@uber.com;rosanne@uber.com;jason@yosinski.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nli2018measuring,\ntitle={Measuring the Intrinsic Dimension of Objective Landscapes},\nauthor={Chunyuan Li and Heerad Farkhoor and Rosanne Liu and Jason Yosinski},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=ryup8-WCW},\n}", "github": "[![github](/images/github_icon.svg) uber-research/intrinsic-dimension](https://github.com/uber-research/intrinsic-dimension) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=ryup8-WCW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 482, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17182266159657033387&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryup8-WCW", "pdf": "https://openreview.net/pdf?id=ryup8-WCW", "email": ";;;", "author_num": 4 }, { "id": "ryvxcPeAb", "title": "Enhancing the Transferability of Adversarial Examples with Noise Reduced Gradient", "track": "main", "status": "Reject", "tldr": "We propose a new method for enhancing the transferability of adversarial examples by using the noise-reduced gradient.", "abstract": "Deep neural networks provide state-of-the-art performance for many applications of interest. Unfortunately, they are known to be vulnerable to adversarial examples, formed by applying small but malicious perturbations to the original inputs. Moreover, the perturbations can transfer across models: adversarial examples generated for a specific model will often mislead other unseen models. Consequently, the adversary can leverage this to attack deployed black-box systems. \nIn this work, we demonstrate that the adversarial perturbation can be decomposed into two components: a model-specific one and a data-dependent one, and it is the latter that mainly contributes to the transferability. Motivated by this understanding, we propose to craft adversarial examples by utilizing the noise reduced gradient (NRG), which approximates the data-dependent component. Experiments on various classification models trained on ImageNet demonstrate that the new approach enhances the transferability dramatically. We also find that low-capacity models have more powerful attack capability than high-capacity counterparts, under the condition that they have comparable test performance. 
These insights give rise to a principled manner to construct adversarial examples with high success rates and could potentially provide us guidance for designing effective defense approaches against black-box attacks. ", "keywords": "black-box attack;adversarial example;deep learning;transferability", "primary_area": "", "supplementary_material": "", "author": "Lei Wu;Zhanxing Zhu;Cheng Tai;Weinan E", "authorids": "leiwu@pku.edu.cn;zhanxing.zhu@pku.edu.cn;chengt@math.princeton.edu;weinan@math.princeton.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2018enhancing,\ntitle={Enhancing the Transferability of Adversarial Examples with Noise Reduced Gradient},\nauthor={Lei Wu and Zhanxing Zhu and Cheng Tai and Weinan E},\nyear={2018},\nurl={https://openreview.net/forum?id=ryvxcPeAb},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryvxcPeAb", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2464323890757693135&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Neural-Guided Deductive Search for Real-Time Program Synthesis from Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/141", "id": "rywDjg-RW", "author_site": "Ashwin Vijayakumar, Abhishek Mohta, Alex Polozov, Dhruv Batra, Prateek Jain, Sumit Gulwani", "tldr": "We integrate symbolic (deductive) and statistical (neural-based) methods to enable real-time program synthesis with almost perfect generalization from 1 input-output example.", "abstract": "Synthesizing user-intended programs from a small number of input-output exam-\nples is a challenging problem with several important applications like spreadsheet\nmanipulation, data wrangling and code refactoring. Existing synthesis systems\neither completely rely on deductive logic techniques that are extensively hand-\nengineered or on purely statistical models that need massive amounts of data, and in\ngeneral fail to provide real-time synthesis on challenging benchmarks. In this work,\nwe propose Neural Guided Deductive Search (NGDS), a hybrid synthesis technique\nthat combines the best of both symbolic logic techniques and statistical models.\nThus, it produces programs that satisfy the provided specifications by construction\nand generalize well on unseen examples, similar to data-driven systems. Our\ntechnique effectively utilizes the deductive search framework to reduce the learning\nproblem of the neural component to a simple supervised learning setup. Further,\nthis allows us to both train on sparingly available real-world data and still leverage\npowerful recurrent neural network encoders. 
We demonstrate the effectiveness\nof our method by evaluating on real-world customer scenarios by synthesizing\naccurate programs with up to 12\u00d7 speed-up compared to state-of-the-art systems.", "keywords": "Program synthesis;deductive search;deep learning;program induction;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "Ashwin Kalyan;Abhishek Mohta;Oleksandr Polozov;Dhruv Batra;Prateek Jain;Sumit Gulwani", "authorids": "ashwinkv@gatech.edu;t-abmoht@microsoft.com;polozov@microsoft.com;dbatra@gatech.edu;prajain@microsoft.com;sumitg@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nkalyan2018neuralguided,\ntitle={Neural-Guided Deductive Search for Real-Time Program Synthesis from Examples},\nauthor={Ashwin Kalyan and Abhishek Mohta and Oleksandr Polozov and Dhruv Batra and Prateek Jain and Sumit Gulwani},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rywDjg-RW},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6666411294628297468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rywDjg-RW", "pdf": "https://openreview.net/pdf?id=rywDjg-RW", "email": ";;;;;", "author_num": 6 }, { "title": "Noisy Networks For Exploration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2018/poster/308", "id": "rywHCPkAW", "author_site": "Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Matteo Hessel, Ian Osband, Alex Graves, Volodymyr Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg", "tldr": "A deep reinforcement learning agent with parametric noise added to its weights can be used to aid efficient exploration.", "abstract": "We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent\u2019s policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. 
We find that replacing the conventional exploration heuristics for A3C, DQN and Dueling agents (entropy reward and epsilon-greedy respectively) with NoisyNet yields substantially higher scores for a wide range of Atari games, in some cases advancing the agent from sub to super-human performance.", "keywords": "Deep Reinforcement Learning;Exploration;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Meire Fortunato;Mohammad Gheshlaghi Azar;Bilal Piot;Jacob Menick;Matteo Hessel;Ian Osband;Alex Graves;Volodymyr Mnih;Remi Munos;Demis Hassabis;Olivier Pietquin;Charles Blundell;Shane Legg", "authorids": "meirefortunato@google.com;mazar@google.com;piot@google.com;jmenick@google.com;mtthss@google.com;iosband@google.com;gravesa@google.com;vmnih@google.com;munos@google.com;dhcontact@google.com;pietquin@google.com;cblundell@google.com;legg@google.com", "gender": ";;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;", "aff": ";;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;", "position": ";;;;;;;;;;;;", "bibtex": "@inproceedings{\nfortunato2018noisy,\ntitle={Noisy Networks For Exploration},\nauthor={Meire Fortunato and Mohammad Gheshlaghi Azar and Bilal Piot and Jacob Menick and Matteo Hessel and Ian Osband and Alex Graves and Volodymyr Mnih and Remi Munos and Demis Hassabis and Olivier Pietquin and Charles Blundell and Shane Legg},\nbooktitle={International Conference on Learning Representations},\nyear={2018},\nurl={https://openreview.net/forum?id=rywHCPkAW},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 15 community implementations](https://paperswithcode.com/paper/?openreview=rywHCPkAW)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 13, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1259, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13916735202249031707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rywHCPkAW", "pdf": "https://openreview.net/pdf?id=rywHCPkAW", "email": ";;;;;;;;;;;;", "author_num": 13 }, { "id": "ryykVe-0W", "title": "Learning Independent Features with Adversarial Nets for Non-linear ICA", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reliable measures of statistical dependence could potentially be useful tools for learning independent features and performing tasks like source separation using Independent Component Analysis (ICA). Unfortunately, many of such measures, like the mutual information, are hard to estimate and optimize directly. We propose to learn independent features with adversarial objectives (Goodfellow et al. 2014, Arjovsky et al. 2017) which optimize such measures implicitly. These objectives compare samples from the joint distribution and the product of the marginals without the need to compute any probability densities. We also propose two methods for obtaining samples from the product of the marginals using either a simple resampling trick or a separate parametric distribution. 
Our experiments show that this strategy can easily be applied to different types of model architectures and solve both linear and non-linear ICA problems.\n", "keywords": "adversarial networks;ica;unsupervised;independence", "primary_area": "", "supplementary_material": "", "author": "Philemon Brakel;Yoshua Bengio", "authorids": "pbpop3@gmail.com;yoshua.bengio@umontreal.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbrakel2018learning,\ntitle={Learning Independent Features with Adversarial Nets for Non-linear {ICA}},\nauthor={Philemon Brakel and Yoshua Bengio},\nyear={2018},\nurl={https://openreview.net/forum?id=ryykVe-0W},\n}", "github": "[![github](/images/github_icon.svg) pbrakel/anica](https://github.com/pbrakel/anica)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryykVe-0W", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;5;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16000486067906297035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryzm6BATZ", "title": "Image Quality Assessment Techniques Improve Training and Evaluation of Energy-Based Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "Image Quality Assessment Techniques Improve Training and Evaluation of Energy-Based Generative Adversarial Networks", "abstract": "We propose a new, multi-component energy function for energy-based Generative Adversarial Networks (GANs) based on methods from the image quality assessment literature. Our approach expands on the Boundary Equilibrium Generative Adversarial Network (BEGAN) by outlining some of the short-comings of the original energy and loss functions. We address these short-comings by incorporating an l1 score, the Gradient Magnitude Similarity score, and a chrominance score into the new energy function. We then provide a set of systematic experiments that explore its hyper-parameters. We show that each of the energy function's components is able to represent a slightly different set of features, which require their own evaluation criteria to assess whether they have been adequately learned. We show that models using the new energy function are able to produce better image representations than the BEGAN model in predicted ways.", "keywords": "generative adversarial networks;gans;deep learning;image modeling;image generation;energy based models", "primary_area": "", "supplementary_material": "", "author": "Michael O. Vertolli;Jim Davies", "authorids": "michaelvertolli@gmail.com;jim@jimdavies.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\no.2018image,\ntitle={Image Quality Assessment Techniques Improve Training and Evaluation of Energy-Based Generative Adversarial Networks},\nauthor={Michael O. 
Vertolli and Jim Davies},\nyear={2018},\nurl={https://openreview.net/forum?id=ryzm6BATZ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryzm6BATZ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RK_oi49vSFkJ:scholar.google.com/&scioq=Image+Quality+Assessment+Techniques+Improve+Training+and+Evaluation+of+Energy-Based+Generative+Adversarial+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 } ]